diff options
Diffstat (limited to 'net')
298 files changed, 5406 insertions, 2830 deletions
diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c index e116d308a8df..37eaff3f7b69 100644 --- a/net/6lowpan/iphc.c +++ b/net/6lowpan/iphc.c @@ -1086,12 +1086,12 @@ static u8 lowpan_iphc_mcast_ctx_addr_compress(u8 **hc_ptr, const struct lowpan_iphc_ctx *ctx, const struct in6_addr *ipaddr) { - u8 data[6]; + u8 data[6] = {}; /* flags/scope, reserved (RIID) */ memcpy(data, &ipaddr->s6_addr[1], 2); /* group ID */ - memcpy(&data[1], &ipaddr->s6_addr[11], 4); + memcpy(&data[2], &ipaddr->s6_addr[12], 4); lowpan_push_hc_data(hc_ptr, data, 6); return LOWPAN_IPHC_DAM_00; diff --git a/net/802/garp.c b/net/802/garp.c index 6f563b6797d9..c7a39f298ad6 100644 --- a/net/802/garp.c +++ b/net/802/garp.c @@ -453,7 +453,7 @@ static int garp_pdu_parse_attr(struct garp_applicant *app, struct sk_buff *skb, if (!pskb_may_pull(skb, ga->len)) return -1; skb_pull(skb, ga->len); - dlen = sizeof(*ga) - ga->len; + dlen = ga->len - sizeof(*ga); if (attrtype > app->app->maxattr) return 0; diff --git a/net/802/mrp.c b/net/802/mrp.c index ff0e80574e6b..160a3b14569c 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -703,6 +703,12 @@ static int mrp_pdu_parse_vecattr(struct mrp_applicant *app, valen = be16_to_cpu(get_unaligned(&mrp_cb(skb)->vah->lenflags) & MRP_VECATTR_HDR_LEN_MASK); + /* If valen is 0, only a LeaveAllEvent is present; FirstValue and + * Vector fields are absent per IEEE 802.1ak. + */ + if (valen == 0) + return 0; + /* The VectorAttribute structure in a PDU carries event information * about one or more attributes having consecutive values. Only the * value for the first attribute is contained in the structure. So @@ -753,6 +759,9 @@ static int mrp_pdu_parse_vecattr(struct mrp_applicant *app, vaevents %= __MRP_VECATTR_EVENT_MAX; vaevent = vaevents; mrp_pdu_parse_vecattr_event(app, skb, vaevent); + valen--; + mrp_attrvalue_inc(mrp_cb(skb)->attrvalue, + mrp_cb(skb)->mh->attrlen); } return 0; } diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index e7315c01a299..078fb7a6efa5 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -393,7 +393,7 @@ static void aarp_purge(void) */ static struct aarp_entry *aarp_alloc(void) { - struct aarp_entry *a = kmalloc_obj(*a, GFP_ATOMIC); + struct aarp_entry *a = kzalloc_obj(*a, GFP_ATOMIC); if (!a) return NULL; @@ -542,6 +542,11 @@ int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb, struct ddpehdr *ddp = (struct ddpehdr *)skb->data; int ft = 2; + if (!at) { + kfree_skb(skb); + return NET_XMIT_DROP; + } + /* * Compressible ? * diff --git a/net/atm/signaling.c b/net/atm/signaling.c index 358fbe5e4d1d..b991d937205a 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -179,6 +179,7 @@ as_indicate_complete: break; default: pr_alert("bad message type %d\n", (int)msg->type); + dev_kfree_skb(skb); /* Paired with find_get_vcc(msg->vcc) above */ sock_put(sk); return -EINVAL; diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index f28e9cbf8ad5..b8b1b997960a 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -173,19 +173,12 @@ free_orig_node_hash: static struct batadv_neigh_node * batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, const u8 *neigh_addr, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh) + struct batadv_orig_node *orig_node) { struct batadv_neigh_node *neigh_node; neigh_node = batadv_neigh_node_get_or_create(orig_node, hard_iface, neigh_addr); - if (!neigh_node) - goto out; - - neigh_node->orig_node = orig_neigh; - -out: return neigh_node; } @@ -231,6 +224,8 @@ static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface) hard_iface->bat_iv.ogm_buff = NULL; mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); + + cancel_delayed_work_sync(&hard_iface->bat_iv.reschedule_work); } static void batadv_iv_ogm_iface_update_mac(struct batadv_hard_iface *hard_iface) @@ -335,7 +330,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); const char *fwd_str; u8 packet_num; - s16 buff_pos; + int buff_pos; struct batadv_ogm_packet *batadv_ogm_packet; struct sk_buff *skb; u8 *packet_pos; @@ -543,8 +538,10 @@ out: * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @own_packet: true if it is a self-generated ogm + * + * Return: whether forward packet was scheduled */ -static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, +static bool batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, int packet_len, unsigned long send_time, bool direct_link, struct batadv_hard_iface *if_incoming, @@ -568,13 +565,13 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, skb = netdev_alloc_skb_ip_align(NULL, skb_size); if (!skb) - return; + return false; forw_packet_aggr = batadv_forw_packet_alloc(if_incoming, if_outgoing, queue_left, bat_priv, skb); if (!forw_packet_aggr) { kfree_skb(skb); - return; + return false; } forw_packet_aggr->skb->priority = TC_PRIO_CONTROL; @@ -597,6 +594,8 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, batadv_iv_send_outstanding_bat_ogm_packet); batadv_forw_packet_ogmv1_queue(bat_priv, forw_packet_aggr, send_time); + + return true; } /* aggregate a new packet into the existing ogm packet */ @@ -624,8 +623,10 @@ static void batadv_iv_ogm_aggregate(struct batadv_forw_packet *forw_packet_aggr, * @if_outgoing: interface for which the retransmission should be considered * @own_packet: true if it is a self-generated ogm * @send_time: timestamp (jiffies) when the packet is to be sent + * + * Return: whether forward packet was scheduled */ -static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, +static bool batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, unsigned char *packet_buff, int packet_len, struct batadv_hard_iface *if_incoming, @@ -677,14 +678,16 @@ static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, if (!own_packet && atomic_read(&bat_priv->aggregated_ogms)) send_time += max_aggregation_jiffies; - batadv_iv_ogm_aggregate_new(packet_buff, packet_len, - send_time, direct_link, - if_incoming, if_outgoing, - own_packet); + return batadv_iv_ogm_aggregate_new(packet_buff, packet_len, + send_time, direct_link, + if_incoming, if_outgoing, + own_packet); } else { batadv_iv_ogm_aggregate(forw_packet_aggr, packet_buff, packet_len, direct_link); spin_unlock_bh(&bat_priv->forw_bat_list_lock); + + return true; } } @@ -797,6 +800,9 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) u32 seqno; u16 tvlv_len = 0; unsigned long send_time; + bool reschedule = false; + bool scheduled; + int ret; lockdep_assert_held(&hard_iface->bat_iv.ogm_buff_mutex); @@ -820,9 +826,15 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) * appended as it may alter the tt tvlv container */ batadv_tt_local_commit_changes(bat_priv); - tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, ogm_buff, - ogm_buff_len, - BATADV_OGM_HLEN); + ret = batadv_tvlv_container_ogm_append(bat_priv, ogm_buff, + ogm_buff_len, + BATADV_OGM_HLEN); + if (ret < 0) { + reschedule = true; + goto out; + } + + tvlv_len = ret; } batadv_ogm_packet = (struct batadv_ogm_packet *)(*ogm_buff); @@ -841,8 +853,11 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) /* OGMs from secondary interfaces are only scheduled on their * respective interfaces. */ - batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, - hard_iface, hard_iface, 1, send_time); + scheduled = batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, + hard_iface, hard_iface, 1, send_time); + if (!scheduled) + reschedule = true; + goto out; } @@ -854,15 +869,28 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) if (!kref_get_unless_zero(&tmp_hard_iface->refcount)) continue; - batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, - *ogm_buff_len, hard_iface, - tmp_hard_iface, 1, send_time); - + scheduled = batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, + *ogm_buff_len, hard_iface, + tmp_hard_iface, 1, send_time); batadv_hardif_put(tmp_hard_iface); + + if (!scheduled && tmp_hard_iface == hard_iface) + reschedule = true; } rcu_read_unlock(); out: + if (reschedule) { + /* there was a failure scheduling the own forward packet. + * as result, the batadv_iv_send_outstanding_bat_ogm_packet() + * work item is no longer scheduled. it is therefore necessary + * to reschedule it manually + */ + queue_delayed_work(batadv_event_workqueue, + &hard_iface->bat_iv.reschedule_work, + msecs_to_jiffies(atomic_read(&bat_priv->orig_interval))); + } + batadv_hardif_put(primary_if); } @@ -877,6 +905,17 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } +static void batadv_iv_ogm_reschedule(struct work_struct *work) +{ + struct delayed_work *delayed_work = to_delayed_work(work); + struct batadv_hard_iface *hard_iface; + + hard_iface = container_of(delayed_work, + struct batadv_hard_iface, + bat_iv.reschedule_work); + batadv_iv_ogm_schedule(hard_iface); +} + /** * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over interface * @orig_node: originator which reproadcasted the OGMs directly @@ -907,6 +946,31 @@ static u8 batadv_iv_orig_ifinfo_sum(struct batadv_orig_node *orig_node, } /** + * batadv_iv_ogm_neigh_ifinfo_sum() - Get bcast_own sum for a last-hop neighbor + * @bat_priv: the bat priv with all the mesh interface information + * @neigh_node: last-hop neighbor of an originator + * + * Return: Number of replied (rebroadcasted) OGMs for the originator currently + * announced by the neighbor. Returns 0 if the neighbor's originator entry is + * not available anymore. + */ +static u8 batadv_iv_ogm_neigh_ifinfo_sum(struct batadv_priv *bat_priv, + const struct batadv_neigh_node *neigh_node) +{ + struct batadv_orig_node *orig_neigh; + u8 sum; + + orig_neigh = batadv_orig_hash_find(bat_priv, neigh_node->addr); + if (!orig_neigh) + return 0; + + sum = batadv_iv_orig_ifinfo_sum(orig_neigh, neigh_node->if_incoming); + batadv_orig_node_put(orig_neigh); + + return sum; +} + +/** * batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an * originator * @bat_priv: the bat priv with all the mesh interface information @@ -975,17 +1039,9 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, } if (!neigh_node) { - struct batadv_orig_node *orig_tmp; - - orig_tmp = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); - if (!orig_tmp) - goto unlock; - neigh_node = batadv_iv_ogm_neigh_new(if_incoming, ethhdr->h_source, - orig_node, orig_tmp); - - batadv_orig_node_put(orig_tmp); + orig_node); if (!neigh_node) goto unlock; } else { @@ -1037,10 +1093,9 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, */ if (router_ifinfo && neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) { - sum_orig = batadv_iv_orig_ifinfo_sum(router->orig_node, - router->if_incoming); - sum_neigh = batadv_iv_orig_ifinfo_sum(neigh_node->orig_node, - neigh_node->if_incoming); + sum_orig = batadv_iv_ogm_neigh_ifinfo_sum(bat_priv, router); + sum_neigh = batadv_iv_ogm_neigh_ifinfo_sum(bat_priv, + neigh_node); if (sum_orig >= sum_neigh) goto out; } @@ -1106,7 +1161,6 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, if (!neigh_node) neigh_node = batadv_iv_ogm_neigh_new(if_incoming, orig_neigh_node->orig, - orig_neigh_node, orig_neigh_node); if (!neigh_node) @@ -1303,6 +1357,32 @@ out: } /** + * batadv_orig_to_direct_router() - get direct next hop neighbor to an orig address + * @bat_priv: the bat priv with all the mesh interface information + * @orig_addr: the originator MAC address to search the best next hop router for + * @if_outgoing: the interface where the OGM should be sent to + * + * Return: A neighbor node which is the best router towards the given originator + * address. Bonding candidates are ignored. + */ +static struct batadv_neigh_node * +batadv_orig_to_direct_router(struct batadv_priv *bat_priv, u8 *orig_addr, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_neigh_node *neigh_node; + struct batadv_orig_node *orig_node; + + orig_node = batadv_orig_hash_find(bat_priv, orig_addr); + if (!orig_node) + return NULL; + + neigh_node = batadv_orig_router_get(orig_node, if_outgoing); + batadv_orig_node_put(orig_node); + + return neigh_node; +} + +/** * batadv_iv_ogm_process_per_outif() - process a batman iv OGM for an outgoing * interface * @skb: the skb containing the OGM @@ -1372,8 +1452,9 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, router = batadv_orig_router_get(orig_node, if_outgoing); if (router) { - router_router = batadv_orig_router_get(router->orig_node, - if_outgoing); + router_router = batadv_orig_to_direct_router(bat_priv, + router->addr, + if_outgoing); router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); } @@ -2227,6 +2308,8 @@ batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1, static void batadv_iv_iface_enabled(struct batadv_hard_iface *hard_iface) { + INIT_DELAYED_WORK(&hard_iface->bat_iv.reschedule_work, batadv_iv_ogm_reschedule); + /* begin scheduling originator messages on that interface */ batadv_iv_ogm_schedule(hard_iface); } diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index e3870492dab7..d66ca77b1aaa 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -113,14 +113,14 @@ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) /** * batadv_v_ogm_send_to_if() - send a batman ogm using a given interface + * @bat_priv: the bat priv with all the mesh interface information * @skb: the OGM to send * @hard_iface: the interface to use to send the OGM */ -static void batadv_v_ogm_send_to_if(struct sk_buff *skb, +static void batadv_v_ogm_send_to_if(struct batadv_priv *bat_priv, + struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); - if (hard_iface->if_status != BATADV_IF_ACTIVE) { kfree_skb(skb); return; @@ -187,6 +187,7 @@ static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface) /** * batadv_v_ogm_aggr_send() - flush & send aggregation queue + * @bat_priv: the bat priv with all the mesh interface information * @hard_iface: the interface with the aggregation queue to flush * * Aggregates all OGMv2 packets currently in the aggregation queue into a @@ -196,7 +197,8 @@ static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface) * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ -static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) +static void batadv_v_ogm_aggr_send(struct batadv_priv *bat_priv, + struct batadv_hard_iface *hard_iface) { unsigned int aggr_len = hard_iface->bat_v.aggr_len; struct sk_buff *skb_aggr; @@ -226,27 +228,32 @@ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) consume_skb(skb); } - batadv_v_ogm_send_to_if(skb_aggr, hard_iface); + batadv_v_ogm_send_to_if(bat_priv, skb_aggr, hard_iface); } /** * batadv_v_ogm_queue_on_if() - queue a batman ogm on a given interface + * @bat_priv: the bat priv with all the mesh interface information * @skb: the OGM to queue * @hard_iface: the interface to queue the OGM on */ -static void batadv_v_ogm_queue_on_if(struct sk_buff *skb, +static void batadv_v_ogm_queue_on_if(struct batadv_priv *bat_priv, + struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); + if (hard_iface->mesh_iface != bat_priv->mesh_iface) { + kfree_skb(skb); + return; + } if (!atomic_read(&bat_priv->aggregated_ogms)) { - batadv_v_ogm_send_to_if(skb, hard_iface); + batadv_v_ogm_send_to_if(bat_priv, skb, hard_iface); return; } spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); if (!batadv_v_ogm_queue_left(skb, hard_iface)) - batadv_v_ogm_aggr_send(hard_iface); + batadv_v_ogm_aggr_send(bat_priv, hard_iface); hard_iface->bat_v.aggr_len += batadv_v_ogm_len(skb); __skb_queue_tail(&hard_iface->bat_v.aggr_list, skb); @@ -262,10 +269,10 @@ static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv) struct batadv_hard_iface *hard_iface; struct batadv_ogm2_packet *ogm_packet; struct sk_buff *skb, *skb_tmp; - unsigned char *ogm_buff; + unsigned char **ogm_buff; struct list_head *iter; - int ogm_buff_len; - u16 tvlv_len = 0; + int *ogm_buff_len; + u16 tvlv_len; int ret; lockdep_assert_held(&bat_priv->bat_v.ogm_buff_mutex); @@ -273,25 +280,27 @@ static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv) if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) goto out; - ogm_buff = bat_priv->bat_v.ogm_buff; - ogm_buff_len = bat_priv->bat_v.ogm_buff_len; + ogm_buff = &bat_priv->bat_v.ogm_buff; + ogm_buff_len = &bat_priv->bat_v.ogm_buff_len; + /* tt changes have to be committed before the tvlv data is * appended as it may alter the tt tvlv container */ batadv_tt_local_commit_changes(bat_priv); - tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, &ogm_buff, - &ogm_buff_len, - BATADV_OGM2_HLEN); + ret = batadv_tvlv_container_ogm_append(bat_priv, ogm_buff, + ogm_buff_len, + BATADV_OGM2_HLEN); + if (ret < 0) + goto reschedule; - bat_priv->bat_v.ogm_buff = ogm_buff; - bat_priv->bat_v.ogm_buff_len = ogm_buff_len; + tvlv_len = ret; - skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + ogm_buff_len); + skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + *ogm_buff_len); if (!skb) goto reschedule; skb_reserve(skb, ETH_HLEN); - skb_put_data(skb, ogm_buff, ogm_buff_len); + skb_put_data(skb, *ogm_buff, *ogm_buff_len); ogm_packet = (struct batadv_ogm2_packet *)skb->data; ogm_packet->seqno = htonl(atomic_read(&bat_priv->bat_v.ogm_seqno)); @@ -343,7 +352,7 @@ static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv) break; } - batadv_v_ogm_queue_on_if(skb_tmp, hard_iface); + batadv_v_ogm_queue_on_if(bat_priv, skb_tmp, hard_iface); batadv_hardif_put(hard_iface); } rcu_read_unlock(); @@ -383,12 +392,14 @@ void batadv_v_ogm_aggr_work(struct work_struct *work) { struct batadv_hard_iface_bat_v *batv; struct batadv_hard_iface *hard_iface; + struct batadv_priv *bat_priv; batv = container_of(work, struct batadv_hard_iface_bat_v, aggr_wq.work); hard_iface = container_of(batv, struct batadv_hard_iface, bat_v); + bat_priv = netdev_priv(hard_iface->mesh_iface); spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); - batadv_v_ogm_aggr_send(hard_iface); + batadv_v_ogm_aggr_send(bat_priv, hard_iface); spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_start_queue_timer(hard_iface); @@ -578,7 +589,7 @@ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv, if_outgoing->net_dev->name, ntohl(ogm_forward->throughput), ogm_forward->ttl, if_incoming->net_dev->name); - batadv_v_ogm_queue_on_if(skb, if_outgoing); + batadv_v_ogm_queue_on_if(bat_priv, skb, if_outgoing); out: batadv_orig_ifinfo_put(orig_ifinfo); diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 51fe028b9088..ffe854018bd3 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -318,8 +318,8 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) if (claim->backbone_gw != backbone_gw) continue; - batadv_claim_put(claim); hlist_del_rcu(&claim->hash_entry); + batadv_claim_put(claim); } spin_unlock_bh(list_lock); } @@ -356,12 +356,14 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, const u8 *mac, sizeof(local_claim_dest)); local_claim_dest.type = claimtype; - mesh_iface = primary_if->mesh_iface; + mesh_iface = READ_ONCE(primary_if->mesh_iface); + if (!mesh_iface) + goto out; skb = arp_create(ARPOP_REPLY, ETH_P_ARP, /* IP DST: 0.0.0.0 */ zeroip, - primary_if->mesh_iface, + mesh_iface, /* IP SRC: 0.0.0.0 */ zeroip, /* Ethernet DST: Broadcast */ @@ -514,8 +516,8 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, const u8 *orig, entry->crc = BATADV_BLA_CRC_INIT; entry->bat_priv = bat_priv; spin_lock_init(&entry->crc_lock); - atomic_set(&entry->request_sent, 0); - atomic_set(&entry->wait_periods, 0); + entry->state = BATADV_BLA_BACKBONE_GW_SYNCED; + entry->wait_periods = 0; ether_addr_copy(entry->orig, orig); INIT_WORK(&entry->report_work, batadv_bla_loopdetect_report); kref_init(&entry->refcount); @@ -544,9 +546,13 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, const u8 *orig, batadv_bla_send_announce(bat_priv, entry); /* this will be decreased in the worker thread */ - atomic_inc(&entry->request_sent); - atomic_set(&entry->wait_periods, BATADV_BLA_WAIT_PERIODS); - atomic_inc(&bat_priv->bla.num_requests); + spin_lock_bh(&bat_priv->bla.num_requests_lock); + if (entry->state == BATADV_BLA_BACKBONE_GW_SYNCED) { + entry->state = BATADV_BLA_BACKBONE_GW_UNSYNCED; + entry->wait_periods = BATADV_BLA_WAIT_PERIODS; + atomic_inc(&bat_priv->bla.num_requests); + } + spin_unlock_bh(&bat_priv->bla.num_requests_lock); } return entry; @@ -649,10 +655,12 @@ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw) backbone_gw->vid, BATADV_CLAIM_TYPE_REQUEST); /* no local broadcasts should be sent or received, for now. */ - if (!atomic_read(&backbone_gw->request_sent)) { + spin_lock_bh(&backbone_gw->bat_priv->bla.num_requests_lock); + if (backbone_gw->state == BATADV_BLA_BACKBONE_GW_SYNCED) { + backbone_gw->state = BATADV_BLA_BACKBONE_GW_UNSYNCED; atomic_inc(&backbone_gw->bat_priv->bla.num_requests); - atomic_set(&backbone_gw->request_sent, 1); } + spin_unlock_bh(&backbone_gw->bat_priv->bla.num_requests_lock); } /** @@ -723,6 +731,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, if (unlikely(hash_added != 0)) { /* only local changes happened. */ + batadv_backbone_gw_put(backbone_gw); kfree(claim); return; } @@ -872,10 +881,12 @@ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, /* if we have sent a request and the crc was OK, * we can allow traffic again. */ - if (atomic_read(&backbone_gw->request_sent)) { + spin_lock_bh(&bat_priv->bla.num_requests_lock); + if (backbone_gw->state == BATADV_BLA_BACKBONE_GW_UNSYNCED) { + backbone_gw->state = BATADV_BLA_BACKBONE_GW_SYNCED; atomic_dec(&backbone_gw->bat_priv->bla.num_requests); - atomic_set(&backbone_gw->request_sent, 0); } + spin_unlock_bh(&bat_priv->bla.num_requests_lock); } batadv_backbone_gw_put(backbone_gw); @@ -1223,6 +1234,7 @@ static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now) struct hlist_head *head; struct batadv_hashtable *hash; spinlock_t *list_lock; /* protects write access to the hash lists */ + bool purged; int i; hash = bat_priv->bla.backbone_hash; @@ -1233,30 +1245,49 @@ static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now) head = &hash->table[i]; list_lock = &hash->list_locks[i]; - spin_lock_bh(list_lock); - hlist_for_each_entry_safe(backbone_gw, node_tmp, - head, hash_entry) { - if (now) - goto purge_now; - if (!batadv_has_timed_out(backbone_gw->lasttime, - BATADV_BLA_BACKBONE_TIMEOUT)) - continue; + do { + purged = false; + + spin_lock_bh(list_lock); + hlist_for_each_entry_safe(backbone_gw, node_tmp, + head, hash_entry) { + if (now) + goto purge_now; + if (!batadv_has_timed_out(backbone_gw->lasttime, + BATADV_BLA_BACKBONE_TIMEOUT)) + continue; - batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, - "%s(): backbone gw %pM timed out\n", - __func__, backbone_gw->orig); + batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, + "%s(): backbone gw %pM timed out\n", + __func__, backbone_gw->orig); purge_now: - /* don't wait for the pending request anymore */ - if (atomic_read(&backbone_gw->request_sent)) - atomic_dec(&bat_priv->bla.num_requests); + purged = true; - batadv_bla_del_backbone_claims(backbone_gw); + /* don't wait for the pending request anymore */ + spin_lock_bh(&bat_priv->bla.num_requests_lock); + if (backbone_gw->state == BATADV_BLA_BACKBONE_GW_UNSYNCED) + atomic_dec(&bat_priv->bla.num_requests); - hlist_del_rcu(&backbone_gw->hash_entry); - batadv_backbone_gw_put(backbone_gw); - } - spin_unlock_bh(list_lock); + backbone_gw->state = BATADV_BLA_BACKBONE_GW_STOPPED; + spin_unlock_bh(&bat_priv->bla.num_requests_lock); + + batadv_bla_del_backbone_claims(backbone_gw); + + hlist_del_rcu(&backbone_gw->hash_entry); + break; + } + spin_unlock_bh(list_lock); + + if (purged) { + /* reference for pending report_work */ + if (cancel_work_sync(&backbone_gw->report_work)) + batadv_backbone_gw_put(backbone_gw); + + /* reference for hash_entry */ + batadv_backbone_gw_put(backbone_gw); + } + } while (purged); } } @@ -1288,6 +1319,13 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { + /* only purge claims not currently in the process of being released. + * Such claims could otherwise have a NULL-ptr backbone_gw set because + * they already went through batadv_claim_release() + */ + if (!kref_get_unless_zero(&claim->refcount)) + continue; + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); if (now) goto purge_now; @@ -1313,6 +1351,7 @@ purge_now: claim->addr, claim->vid); skip: batadv_backbone_gw_put(backbone_gw); + batadv_claim_put(claim); } rcu_read_unlock(); } @@ -1483,7 +1522,7 @@ static void batadv_bla_periodic_work(struct work_struct *work) batadv_bla_send_loopdetect(bat_priv, backbone_gw); - /* request_sent is only set after creation to avoid + /* state is only set to unsynced after creation to avoid * problems when we are not yet known as backbone gw * in the backbone. * @@ -1492,14 +1531,21 @@ static void batadv_bla_periodic_work(struct work_struct *work) * some grace time. */ - if (atomic_read(&backbone_gw->request_sent) == 0) - continue; + spin_lock_bh(&bat_priv->bla.num_requests_lock); + if (backbone_gw->state != BATADV_BLA_BACKBONE_GW_UNSYNCED) + goto unlock_next; - if (!atomic_dec_and_test(&backbone_gw->wait_periods)) - continue; + if (backbone_gw->wait_periods > 0) + backbone_gw->wait_periods--; + + if (backbone_gw->wait_periods > 0) + goto unlock_next; + backbone_gw->state = BATADV_BLA_BACKBONE_GW_SYNCED; atomic_dec(&backbone_gw->bat_priv->bla.num_requests); - atomic_set(&backbone_gw->request_sent, 0); + +unlock_next: + spin_unlock_bh(&bat_priv->bla.num_requests_lock); } rcu_read_unlock(); } diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 3efc4cf50b46..0a8bd95e2f99 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -696,6 +696,9 @@ static bool batadv_dat_forward_data(struct batadv_priv *bat_priv, goto free_orig; tmp_skb = pskb_copy_for_clone(skb, GFP_ATOMIC); + if (!tmp_skb) + goto free_neigh; + if (!batadv_send_skb_prepare_unicast_4addr(bat_priv, tmp_skb, cand[i].orig_node, packet_subtype)) { diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index f4e45cc25816..e9553db42349 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -17,6 +17,7 @@ #include <linux/lockdep.h> #include <linux/minmax.h> #include <linux/netdevice.h> +#include <linux/overflow.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -80,9 +81,9 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node, * * Return: the maximum size of payload that can be fragmented. */ -static int batadv_frag_size_limit(void) +static size_t batadv_frag_size_limit(void) { - int limit = BATADV_FRAG_MAX_FRAG_SIZE; + size_t limit = BATADV_FRAG_MAX_FRAG_SIZE; limit -= sizeof(struct batadv_frag_packet); limit *= BATADV_FRAG_MAX_FRAGMENTS; @@ -143,7 +144,9 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, struct batadv_frag_packet *frag_packet; u8 bucket; u16 seqno, hdr_size = sizeof(struct batadv_frag_packet); + bool overflow = false; bool ret = false; + size_t data_len; /* Linearize packet to avoid linearizing 16 packets in a row when doing * the later merge. Non-linear merge should be added to remove this @@ -153,6 +156,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, goto err; frag_packet = (struct batadv_frag_packet *)skb->data; + data_len = skb->len - hdr_size; seqno = ntohs(frag_packet->seqno); bucket = seqno % BATADV_FRAG_BUFFER_COUNT; @@ -171,7 +175,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, spin_lock_bh(&chain->lock); if (batadv_frag_init_chain(chain, seqno)) { hlist_add_head(&frag_entry_new->list, &chain->fragment_list); - chain->size = skb->len - hdr_size; + chain->size = data_len; chain->timestamp = jiffies; chain->total_size = ntohs(frag_packet->total_size); ret = true; @@ -188,7 +192,11 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, if (frag_entry_curr->no < frag_entry_new->no) { hlist_add_before(&frag_entry_new->list, &frag_entry_curr->list); - chain->size += skb->len - hdr_size; + + if (check_add_overflow(chain->size, data_len, + &chain->size)) + overflow = true; + chain->timestamp = jiffies; ret = true; goto out; @@ -201,13 +209,16 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, /* Reached the end of the list, so insert after 'frag_entry_last'. */ if (likely(frag_entry_last)) { hlist_add_behind(&frag_entry_new->list, &frag_entry_last->list); - chain->size += skb->len - hdr_size; + + if (check_add_overflow(chain->size, data_len, &chain->size)) + overflow = true; + chain->timestamp = jiffies; ret = true; } out: - if (chain->size > batadv_frag_size_limit() || + if (overflow || chain->size > batadv_frag_size_limit() || chain->total_size != ntohs(frag_packet->total_size) || chain->total_size > batadv_frag_size_limit()) { /* Clear chain if total size of either the list or the packet @@ -294,6 +305,31 @@ free: } /** + * batadv_skb_is_frag() - check if newly merged skb contains unicast fragment + * @skb: newly merged skb + * + * Return: if newly merged skb is of type BATADV_UNICAST_FRAG + */ +static bool batadv_skb_is_frag(struct sk_buff *skb) +{ + struct batadv_ogm_packet *batadv_ogm_packet; + + /* packet should hold at least type and version */ + if (unlikely(!pskb_may_pull(skb, 2))) + return false; + + batadv_ogm_packet = (struct batadv_ogm_packet *)skb->data; + + if (batadv_ogm_packet->version != BATADV_COMPAT_VERSION) + return false; + + if (batadv_ogm_packet->packet_type != BATADV_UNICAST_FRAG) + return false; + + return true; +} + +/** * batadv_frag_skb_buffer() - buffer fragment for later merge * @skb: skb to buffer * @orig_node_src: originator that the skb is received from @@ -326,6 +362,16 @@ bool batadv_frag_skb_buffer(struct sk_buff **skb, if (!skb_out) goto out_err; + /* fragment in fragment is not allowed. otherwise it is possible + * to exhaust the stack when receiving a matryoshka-style + * "fragments in a fragment packet" + */ + if (batadv_skb_is_frag(skb_out)) { + kfree_skb(skb_out); + skb_out = NULL; + goto out_err; + } + out: ret = true; out_err: diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 51e9c081a2a4..a9d0346e8332 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -478,10 +478,14 @@ void batadv_gw_node_delete(struct batadv_priv *bat_priv, */ void batadv_gw_node_free(struct batadv_priv *bat_priv) { + struct batadv_gw_node *curr_gw; struct batadv_gw_node *gw_node; struct hlist_node *node_tmp; spin_lock_bh(&bat_priv->gw.list_lock); + curr_gw = rcu_replace_pointer(bat_priv->gw.curr_gw, NULL, true); + batadv_gw_node_put(curr_gw); + hlist_for_each_entry_safe(gw_node, node_tmp, &bat_priv->gw.gateway_list, list) { hlist_del_init_rcu(&gw_node->list); diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 3a35aadd8b41..a4d33ee0fda5 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -249,6 +249,7 @@ void batadv_mesh_free(struct net_device *mesh_iface) atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); batadv_purge_outstanding_packets(bat_priv, NULL); + batadv_tp_stop_all(bat_priv); batadv_gw_node_free(bat_priv); diff --git a/net/batman-adv/mesh-interface.c b/net/batman-adv/mesh-interface.c index 56ca1c1b83f2..e7aa45bc6b7a 100644 --- a/net/batman-adv/mesh-interface.c +++ b/net/batman-adv/mesh-interface.c @@ -787,6 +787,7 @@ static int batadv_meshif_init_late(struct net_device *dev) atomic_set(&bat_priv->tt.ogm_append_cnt, 0); #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bla.num_requests, 0); + spin_lock_init(&bat_priv->bla.num_requests_lock); #endif atomic_set(&bat_priv->tp_num, 0); diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index b3468ccab535..ad4921b659d9 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -835,8 +835,6 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) orig_node = container_of(rcu, struct batadv_orig_node, rcu); - batadv_mcast_purge_orig(orig_node); - batadv_frag_purge_orig(orig_node, NULL); kfree(orig_node->tt_buff); @@ -887,6 +885,8 @@ void batadv_orig_node_release(struct kref *ref) } spin_unlock_bh(&orig_node->vlan_list_lock); + batadv_mcast_purge_orig(orig_node); + call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); } diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 2e42f6b348c8..0fc4ca78e84e 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -8,10 +8,12 @@ #include "main.h" #include <linux/atomic.h> +#include <linux/bug.h> #include <linux/build_bug.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> +#include <linux/completion.h> #include <linux/container_of.h> #include <linux/err.h> #include <linux/etherdevice.h> @@ -253,6 +255,7 @@ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason, * batadv_tp_list_find() - find a tp_vars object in the global list * @bat_priv: the bat priv with all the mesh interface information * @dst: the other endpoint MAC address to look for + * @role: role of the session * * Look for a tp_vars object matching dst as end_point and return it after * having increment the refcounter. Return NULL is not found @@ -260,7 +263,8 @@ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason, * Return: matching tp_vars or NULL when no tp_vars with @dst was found */ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, - const u8 *dst) + const u8 *dst, + enum batadv_tp_meter_role role) { struct batadv_tp_vars *pos, *tp_vars = NULL; @@ -269,6 +273,9 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, if (!batadv_compare_eth(pos->other_end, dst)) continue; + if (pos->role != role) + continue; + /* most of the time this function is invoked during the normal * process..it makes sens to pay more when the session is * finished and to speed the process up during the measurement @@ -285,11 +292,32 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, } /** + * batadv_tp_list_active() - check if session from/to destination is ongoing + * @bat_priv: the bat priv with all the mesh interface information + * @dst: the other endpoint MAC address to look for + * + * Return: if matching session with @dst was found + */ +static bool batadv_tp_list_active(struct batadv_priv *bat_priv, const u8 *dst) + __must_hold(&bat_priv->tp_list_lock) +{ + struct batadv_tp_vars *tp_vars; + + hlist_for_each_entry_rcu(tp_vars, &bat_priv->tp_list, list) { + if (batadv_compare_eth(tp_vars->other_end, dst)) + return true; + } + + return false; +} + +/** * batadv_tp_list_find_session() - find tp_vars session object in the global * list * @bat_priv: the bat priv with all the mesh interface information * @dst: the other endpoint MAC address to look for * @session: session identifier + * @role: role of the session * * Look for a tp_vars object matching dst as end_point, session as tp meter * session and return it after having increment the refcounter. Return NULL @@ -299,7 +327,7 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, */ static struct batadv_tp_vars * batadv_tp_list_find_session(struct batadv_priv *bat_priv, const u8 *dst, - const u8 *session) + const u8 *session, enum batadv_tp_meter_role role) { struct batadv_tp_vars *pos, *tp_vars = NULL; @@ -311,6 +339,9 @@ batadv_tp_list_find_session(struct batadv_priv *bat_priv, const u8 *dst, if (memcmp(pos->session, session, sizeof(pos->session)) != 0) continue; + if (pos->role != role) + continue; + /* most of the time this function is invoked during the normal * process..it makes sense to pay more when the session is * finished and to speed the process up during the measurement @@ -365,32 +396,41 @@ static void batadv_tp_vars_put(struct batadv_tp_vars *tp_vars) } /** - * batadv_tp_sender_cleanup() - cleanup sender data and drop and timer - * @bat_priv: the bat priv with all the mesh interface information - * @tp_vars: the private data of the current TP meter session to cleanup + * batadv_tp_list_detach() - remove tp session from mesh session list once + * @tp_vars: the private data of the current TP meter session */ -static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv, - struct batadv_tp_vars *tp_vars) +static void batadv_tp_list_detach(struct batadv_tp_vars *tp_vars) { - cancel_delayed_work(&tp_vars->finish_work); + bool detached = false; spin_lock_bh(&tp_vars->bat_priv->tp_list_lock); - hlist_del_rcu(&tp_vars->list); + if (!hlist_unhashed(&tp_vars->list)) { + hlist_del_init_rcu(&tp_vars->list); + detached = true; + } spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock); + if (!detached) + return; + + atomic_dec(&tp_vars->bat_priv->tp_num); + /* drop list reference */ batadv_tp_vars_put(tp_vars); +} - atomic_dec(&tp_vars->bat_priv->tp_num); +/** + * batadv_tp_sender_cleanup() - cleanup sender data and drop and timer + * @tp_vars: the private data of the current TP meter session to cleanup + */ +static void batadv_tp_sender_cleanup(struct batadv_tp_vars *tp_vars) +{ + cancel_delayed_work_sync(&tp_vars->finish_work); + + batadv_tp_list_detach(tp_vars); /* kill the timer and remove its reference */ - timer_delete_sync(&tp_vars->timer); - /* the worker might have rearmed itself therefore we kill it again. Note - * that if the worker should run again before invoking the following - * timer_delete(), it would not re-arm itself once again because the status - * is OFF now - */ - timer_delete(&tp_vars->timer); + timer_shutdown_sync(&tp_vars->timer); batadv_tp_vars_put(tp_vars); } @@ -402,11 +442,14 @@ static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv, static void batadv_tp_sender_end(struct batadv_priv *bat_priv, struct batadv_tp_vars *tp_vars) { + enum batadv_tp_meter_reason reason; u32 session_cookie; + reason = atomic_read(&tp_vars->send_result); + batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Test towards %pM finished..shutting down (reason=%d)\n", - tp_vars->other_end, tp_vars->reason); + tp_vars->other_end, reason); batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Last timing stats: SRTT=%ums RTTVAR=%ums RTO=%ums\n", @@ -419,7 +462,7 @@ static void batadv_tp_sender_end(struct batadv_priv *bat_priv, session_cookie = batadv_tp_session_cookie(tp_vars->session, tp_vars->icmp_uid); - batadv_tp_batctl_notify(tp_vars->reason, + batadv_tp_batctl_notify(reason, tp_vars->other_end, bat_priv, tp_vars->start_time, @@ -435,10 +478,18 @@ static void batadv_tp_sender_end(struct batadv_priv *bat_priv, static void batadv_tp_sender_shutdown(struct batadv_tp_vars *tp_vars, enum batadv_tp_meter_reason reason) { - if (!atomic_dec_and_test(&tp_vars->sending)) - return; + atomic_cmpxchg(&tp_vars->send_result, 0, reason); +} - tp_vars->reason = reason; +/** + * batadv_tp_sender_stopped() - check if tp session was stopped with reason + * @tp_vars: the private data of the current TP meter session + * + * Return: whether stop reason was found + */ +static bool batadv_tp_sender_stopped(struct batadv_tp_vars *tp_vars) +{ + return atomic_read(&tp_vars->send_result) != 0; } /** @@ -468,7 +519,7 @@ static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars) /* most of the time this function is invoked while normal packet * reception... */ - if (unlikely(atomic_read(&tp_vars->sending) == 0)) + if (unlikely(batadv_tp_sender_stopped(tp_vars))) /* timer ref will be dropped in batadv_tp_sender_cleanup */ return; @@ -488,7 +539,7 @@ static void batadv_tp_sender_timeout(struct timer_list *t) struct batadv_tp_vars *tp_vars = timer_container_of(tp_vars, t, timer); struct batadv_priv *bat_priv = tp_vars->bat_priv; - if (atomic_read(&tp_vars->sending) == 0) + if (batadv_tp_sender_stopped(tp_vars)) return; /* if the user waited long enough...shutdown the test */ @@ -643,11 +694,11 @@ static void batadv_tp_recv_ack(struct batadv_priv *bat_priv, /* find the tp_vars */ tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig, - icmp->session); + icmp->session, BATADV_TP_SENDER); if (unlikely(!tp_vars)) return; - if (unlikely(atomic_read(&tp_vars->sending) == 0)) + if (unlikely(batadv_tp_sender_stopped(tp_vars))) goto out; /* old ACK? silently drop it.. */ @@ -813,21 +864,21 @@ static int batadv_tp_send(void *arg) if (unlikely(tp_vars->role != BATADV_TP_SENDER)) { err = BATADV_TP_REASON_DST_UNREACHABLE; - tp_vars->reason = err; + batadv_tp_sender_shutdown(tp_vars, err); goto out; } orig_node = batadv_orig_hash_find(bat_priv, tp_vars->other_end); if (unlikely(!orig_node)) { err = BATADV_TP_REASON_DST_UNREACHABLE; - tp_vars->reason = err; + batadv_tp_sender_shutdown(tp_vars, err); goto out; } primary_if = batadv_primary_if_get_selected(bat_priv); if (unlikely(!primary_if)) { err = BATADV_TP_REASON_DST_UNREACHABLE; - tp_vars->reason = err; + batadv_tp_sender_shutdown(tp_vars, err); goto out; } @@ -846,7 +897,7 @@ static int batadv_tp_send(void *arg) queue_delayed_work(batadv_event_workqueue, &tp_vars->finish_work, msecs_to_jiffies(tp_vars->test_length)); - while (atomic_read(&tp_vars->sending) != 0) { + while (!batadv_tp_sender_stopped(tp_vars)) { if (unlikely(!batadv_tp_avail(tp_vars, payload_len))) { batadv_tp_wait_available(tp_vars, payload_len); continue; @@ -869,8 +920,7 @@ static int batadv_tp_send(void *arg) "Meter: %s() cannot send packets (%d)\n", __func__, err); /* ensure nobody else tries to stop the thread now */ - if (atomic_dec_and_test(&tp_vars->sending)) - tp_vars->reason = err; + batadv_tp_sender_shutdown(tp_vars, err); break; } @@ -886,7 +936,8 @@ out: batadv_orig_node_put(orig_node); batadv_tp_sender_end(bat_priv, tp_vars); - batadv_tp_sender_cleanup(bat_priv, tp_vars); + batadv_tp_sender_cleanup(tp_vars); + complete(&tp_vars->finished); batadv_tp_vars_put(tp_vars); @@ -918,7 +969,8 @@ static void batadv_tp_start_kthread(struct batadv_tp_vars *tp_vars) batadv_tp_vars_put(tp_vars); /* cleanup of failed tp meter variables */ - batadv_tp_sender_cleanup(bat_priv, tp_vars); + batadv_tp_sender_cleanup(tp_vars); + complete(&tp_vars->finished); return; } @@ -947,10 +999,15 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, /* look for an already existing test towards this node */ spin_lock_bh(&bat_priv->tp_list_lock); - tp_vars = batadv_tp_list_find(bat_priv, dst); - if (tp_vars) { + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) { + spin_unlock_bh(&bat_priv->tp_list_lock); + batadv_tp_batctl_error_notify(BATADV_TP_REASON_DST_UNREACHABLE, + dst, bat_priv, session_cookie); + return; + } + + if (batadv_tp_list_active(bat_priv, dst)) { spin_unlock_bh(&bat_priv->tp_list_lock); - batadv_tp_vars_put(tp_vars); batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: test to or from the same node already ongoing, aborting\n"); batadv_tp_batctl_error_notify(BATADV_TP_REASON_ALREADY_ONGOING, @@ -969,6 +1026,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, tp_vars = kmalloc_obj(*tp_vars, GFP_ATOMIC); if (!tp_vars) { + atomic_dec(&bat_priv->tp_num); spin_unlock_bh(&bat_priv->tp_list_lock); batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: %s cannot allocate list elements\n", @@ -982,7 +1040,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, ether_addr_copy(tp_vars->other_end, dst); kref_init(&tp_vars->refcount); tp_vars->role = BATADV_TP_SENDER; - atomic_set(&tp_vars->sending, 1); + atomic_set(&tp_vars->send_result, 0); memcpy(tp_vars->session, session_id, sizeof(session_id)); tp_vars->icmp_uid = icmp_uid; @@ -1017,6 +1075,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, tp_vars->start_time = jiffies; init_waitqueue_head(&tp_vars->more_bytes); + init_completion(&tp_vars->finished); spin_lock_init(&tp_vars->unacked_lock); INIT_LIST_HEAD(&tp_vars->unacked_list); @@ -1069,16 +1128,16 @@ void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst, if (!orig_node) return; - tp_vars = batadv_tp_list_find(bat_priv, orig_node->orig); + tp_vars = batadv_tp_list_find(bat_priv, orig_node->orig, BATADV_TP_SENDER); if (!tp_vars) { batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: trying to interrupt an already over connection\n"); - goto out; + goto out_put_orig_node; } batadv_tp_sender_shutdown(tp_vars, return_value); batadv_tp_vars_put(tp_vars); -out: +out_put_orig_node: batadv_orig_node_put(orig_node); } @@ -1119,14 +1178,7 @@ static void batadv_tp_receiver_shutdown(struct timer_list *t) "Shutting down for inactivity (more than %dms) from %pM\n", BATADV_TP_RECV_TIMEOUT, tp_vars->other_end); - spin_lock_bh(&tp_vars->bat_priv->tp_list_lock); - hlist_del_rcu(&tp_vars->list); - spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock); - - /* drop list reference */ - batadv_tp_vars_put(tp_vars); - - atomic_dec(&bat_priv->tp_num); + batadv_tp_list_detach(tp_vars); spin_lock_bh(&tp_vars->unacked_lock); list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) { @@ -1136,6 +1188,9 @@ static void batadv_tp_receiver_shutdown(struct timer_list *t) spin_unlock_bh(&tp_vars->unacked_lock); /* drop reference of timer */ + if (WARN_ON(atomic_xchg(&tp_vars->receiving, 0) != 1)) + return; + batadv_tp_vars_put(tp_vars); } @@ -1329,11 +1384,14 @@ static struct batadv_tp_vars * batadv_tp_init_recv(struct batadv_priv *bat_priv, const struct batadv_icmp_tp_packet *icmp) { - struct batadv_tp_vars *tp_vars; + struct batadv_tp_vars *tp_vars = NULL; spin_lock_bh(&bat_priv->tp_list_lock); + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) + goto out_unlock; + tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig, - icmp->session); + icmp->session, BATADV_TP_RECEIVER); if (tp_vars) goto out_unlock; @@ -1344,11 +1402,14 @@ batadv_tp_init_recv(struct batadv_priv *bat_priv, } tp_vars = kmalloc_obj(*tp_vars, GFP_ATOMIC); - if (!tp_vars) + if (!tp_vars) { + atomic_dec(&bat_priv->tp_num); goto out_unlock; + } ether_addr_copy(tp_vars->other_end, icmp->orig); tp_vars->role = BATADV_TP_RECEIVER; + atomic_set(&tp_vars->receiving, 1); memcpy(tp_vars->session, icmp->session, sizeof(tp_vars->session)); tp_vars->last_recv = BATADV_TP_FIRST_SEQ; tp_vars->bat_priv = bat_priv; @@ -1401,7 +1462,7 @@ static void batadv_tp_recv_msg(struct batadv_priv *bat_priv, } } else { tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig, - icmp->session); + icmp->session, BATADV_TP_RECEIVER); if (!tp_vars) { batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Unexpected packet from %pM!\n", @@ -1410,13 +1471,6 @@ static void batadv_tp_recv_msg(struct batadv_priv *bat_priv, } } - if (unlikely(tp_vars->role != BATADV_TP_RECEIVER)) { - batadv_dbg(BATADV_DBG_TP_METER, bat_priv, - "Meter: dropping packet: not expected (role=%u)\n", - tp_vars->role); - goto out; - } - tp_vars->last_recv_time = jiffies; /* if the packet is a duplicate, it may be the case that an ACK has been @@ -1464,6 +1518,9 @@ void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_icmp_tp_packet *icmp; + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) + goto out; + icmp = (struct batadv_icmp_tp_packet *)skb->data; switch (icmp->subtype) { @@ -1478,10 +1535,62 @@ void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb) "Received unknown TP Metric packet type %u\n", icmp->subtype); } + +out: consume_skb(skb); } /** + * batadv_tp_stop_all() - stop all currently running tp meter sessions + * @bat_priv: the bat priv with all the mesh interface information + */ +void batadv_tp_stop_all(struct batadv_priv *bat_priv) +{ + struct batadv_tp_vars *tp_vars[BATADV_TP_MAX_NUM]; + struct batadv_tp_vars *tp_var; + size_t count = 0; + size_t i; + + spin_lock_bh(&bat_priv->tp_list_lock); + hlist_for_each_entry(tp_var, &bat_priv->tp_list, list) { + if (WARN_ON_ONCE(count >= BATADV_TP_MAX_NUM)) + break; + + if (!kref_get_unless_zero(&tp_var->refcount)) + continue; + + tp_vars[count++] = tp_var; + } + spin_unlock_bh(&bat_priv->tp_list_lock); + + for (i = 0; i < count; i++) { + tp_var = tp_vars[i]; + + switch (tp_var->role) { + case BATADV_TP_SENDER: + batadv_tp_sender_shutdown(tp_var, + BATADV_TP_REASON_CANCEL); + wake_up(&tp_var->more_bytes); + wait_for_completion(&tp_var->finished); + break; + case BATADV_TP_RECEIVER: + batadv_tp_list_detach(tp_var); + timer_shutdown_sync(&tp_var->timer); + + if (atomic_xchg(&tp_var->receiving, 0) != 1) + break; + + batadv_tp_vars_put(tp_var); + break; + } + + batadv_tp_vars_put(tp_var); + } + + synchronize_net(); +} + +/** * batadv_tp_meter_init() - initialize global tp_meter structures */ void __init batadv_tp_meter_init(void) diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index f0046d366eac..4e97cd10cd02 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -17,6 +17,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, u32 test_length, u32 *cookie); void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst, u8 return_value); +void batadv_tp_stop_all(struct batadv_priv *bat_priv); void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb); #endif /* _NET_BATMAN_ADV_TP_METER_H_ */ diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 05cddcf994f6..9f6e67771ffa 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -797,24 +797,33 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, s32 *tt_len) { u16 num_vlan = 0; - u16 num_entries = 0; u16 tvlv_len = 0; unsigned int change_offset; struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_orig_node_vlan *vlan; + u16 total_entries = 0; u8 *tt_change_ptr; + int vlan_entries; + u16 sum_entries; spin_lock_bh(&orig_node->vlan_list_lock); hlist_for_each_entry(vlan, &orig_node->vlan_list, list) { + vlan_entries = atomic_read(&vlan->tt.num_entries); + + if (check_add_overflow(vlan_entries, total_entries, &sum_entries)) { + *tt_len = 0; + goto out; + } + + total_entries = sum_entries; num_vlan++; - num_entries += atomic_read(&vlan->tt.num_entries); } change_offset = struct_size(*tt_data, vlan_data, num_vlan); /* if tt_len is negative, allocate the space needed by the full table */ if (*tt_len < 0) - *tt_len = batadv_tt_len(num_entries); + *tt_len = batadv_tt_len(total_entries); if (change_offset > U16_MAX || *tt_len > U16_MAX - change_offset) { *tt_len = 0; @@ -835,14 +844,26 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, (*tt_data)->num_vlan = htons(num_vlan); tt_vlan = (*tt_data)->vlan_data; + num_vlan = 0; hlist_for_each_entry(vlan, &orig_node->vlan_list, list) { + vlan_entries = atomic_read(&vlan->tt.num_entries); + if (vlan_entries < 1) + continue; + tt_vlan->vid = htons(vlan->vid); tt_vlan->crc = htonl(vlan->tt.crc); tt_vlan->reserved = 0; tt_vlan++; + num_vlan++; } + /* recalculate in case number of VLANs reduced */ + change_offset = struct_size(*tt_data, vlan_data, num_vlan); + tvlv_len = *tt_len + change_offset; + + (*tt_data)->num_vlan = htons(num_vlan); + tt_change_ptr = (u8 *)*tt_data + change_offset; *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; @@ -877,21 +898,25 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, { struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_meshif_vlan *vlan; + size_t change_offset; u16 num_vlan = 0; - u16 vlan_entries = 0; u16 total_entries = 0; u16 tvlv_len; u8 *tt_change_ptr; - int change_offset; + int vlan_entries; + u16 sum_entries; spin_lock_bh(&bat_priv->meshif_vlan_list_lock); hlist_for_each_entry(vlan, &bat_priv->meshif_vlan_list, list) { vlan_entries = atomic_read(&vlan->tt.num_entries); - if (vlan_entries < 1) - continue; + if (check_add_overflow(vlan_entries, total_entries, &sum_entries)) { + tvlv_len = 0; + goto out; + } + + total_entries = sum_entries; num_vlan++; - total_entries += vlan_entries; } change_offset = struct_size(*tt_data, vlan_data, num_vlan); @@ -900,8 +925,10 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, if (*tt_len < 0) *tt_len = batadv_tt_len(total_entries); - tvlv_len = *tt_len; - tvlv_len += change_offset; + if (check_add_overflow(*tt_len, change_offset, &tvlv_len)) { + tvlv_len = 0; + goto out; + } *tt_data = kmalloc(tvlv_len, GFP_ATOMIC); if (!*tt_data) { @@ -914,6 +941,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, (*tt_data)->num_vlan = htons(num_vlan); tt_vlan = (*tt_data)->vlan_data; + num_vlan = 0; hlist_for_each_entry(vlan, &bat_priv->meshif_vlan_list, list) { vlan_entries = atomic_read(&vlan->tt.num_entries); if (vlan_entries < 1) @@ -924,8 +952,15 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, tt_vlan->reserved = 0; tt_vlan++; + num_vlan++; } + /* recalculate in case number of VLANs reduced */ + change_offset = struct_size(*tt_data, vlan_data, num_vlan); + tvlv_len = *tt_len + change_offset; + + (*tt_data)->num_vlan = htons(num_vlan); + tt_change_ptr = (u8 *)*tt_data + change_offset; *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 8129a3f9c44d..cc6ac580c620 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -8,10 +8,12 @@ #include <linux/byteorder/generic.h> #include <linux/container_of.h> +#include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/kref.h> +#include <linux/limits.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -159,10 +161,10 @@ batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) * * Return: size of all currently registered tvlv containers in bytes. */ -static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) +static size_t batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) { struct batadv_tvlv_container *tvlv; - u16 tvlv_len = 0; + size_t tvlv_len = 0; lockdep_assert_held(&bat_priv->tvlv.container_list_lock); @@ -306,26 +308,35 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, * The ogm packet might be enlarged or shrunk depending on the current size * and the size of the to-be-appended tvlv containers. * - * Return: size of all appended tvlv containers in bytes. + * Return: size of all appended tvlv containers in bytes (max U16_MAX), negative + * if operation failed */ -u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, +int batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, unsigned char **packet_buff, int *packet_buff_len, int packet_min_len) { struct batadv_tvlv_container *tvlv; struct batadv_tvlv_hdr *tvlv_hdr; - u16 tvlv_value_len; + size_t tvlv_value_len; void *tvlv_value; + int tvlv_len_ret; bool ret; spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv_value_len = batadv_tvlv_container_list_size(bat_priv); + if (tvlv_value_len > U16_MAX) { + tvlv_len_ret = -E2BIG; + goto end; + } ret = batadv_tvlv_realloc_packet_buff(packet_buff, packet_buff_len, packet_min_len, tvlv_value_len); - - if (!ret) + if (!ret) { + tvlv_len_ret = -ENOMEM; goto end; + } + + tvlv_len_ret = tvlv_value_len; if (!tvlv_value_len) goto end; @@ -344,7 +355,8 @@ u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, end: spin_unlock_bh(&bat_priv->tvlv.container_list_lock); - return tvlv_value_len; + + return tvlv_len_ret; } /** diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index e5697230d991..f96f6b3f44a0 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -16,7 +16,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, u8 type, u8 version, void *tvlv_value, u16 tvlv_value_len); -u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, +int batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, unsigned char **packet_buff, int *packet_buff_len, int packet_min_len); void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 8fc5fe0e9b05..a01ee46d97f3 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -14,6 +14,7 @@ #include <linux/average.h> #include <linux/bitops.h> #include <linux/compiler.h> +#include <linux/completion.h> #include <linux/if.h> #include <linux/if_ether.h> #include <linux/kref.h> @@ -82,6 +83,9 @@ struct batadv_hard_iface_bat_iv { /** @ogm_seqno: OGM sequence number - used to identify each OGM */ atomic_t ogm_seqno; + /** @reschedule_work: recover OGM schedule after schedule error */ + struct delayed_work reschedule_work; + /** @ogm_buff_mutex: lock protecting ogm_buff and ogm_buff_len */ struct mutex ogm_buff_mutex; }; @@ -300,7 +304,7 @@ struct batadv_frag_table_entry { u16 seqno; /** @size: accumulated size of packets in list */ - u16 size; + size_t size; /** @total_size: expected size of the assembled packet */ u16 total_size; @@ -451,7 +455,7 @@ struct batadv_orig_node { * @tt_buff_len: length of the last tt changeset this node received * from the orig node */ - s16 tt_buff_len; + u16 tt_buff_len; /** @tt_buff_lock: lock that protects tt_buff and tt_buff_len */ spinlock_t tt_buff_lock; @@ -992,7 +996,7 @@ struct batadv_priv_tt { * @last_changeset_len: length of last tt changeset this host has * generated */ - s16 last_changeset_len; + u16 last_changeset_len; /** * @last_changeset_lock: lock protecting last_changeset & @@ -1023,6 +1027,12 @@ struct batadv_priv_bla { atomic_t num_requests; /** + * @num_requests_lock: locks update num_requests + + * batadv_backbone_gw::state + batadv_backbone_gw::wait_periods update + */ + spinlock_t num_requests_lock; + + /** * @claim_hash: hash table containing mesh nodes this host has claimed */ struct batadv_hashtable *claim_hash; @@ -1319,15 +1329,21 @@ struct batadv_tp_vars { /** @role: receiver/sender modi */ enum batadv_tp_meter_role role; - /** @sending: sending binary semaphore: 1 if sending, 0 is not */ - atomic_t sending; + /** + * @send_result: 0 when sending is ongoing and otherwise + * enum batadv_tp_meter_reason + */ + atomic_t send_result; - /** @reason: reason for a stopped session */ - enum batadv_tp_meter_reason reason; + /** @receiving: receiving binary semaphore: 1 if receiving, 0 is not */ + atomic_t receiving; /** @finish_work: work item for the finishing procedure */ struct delayed_work finish_work; + /** @finished: completion signaled when a sender thread exits */ + struct completion finished; + /** @test_length: test length in milliseconds */ u32 test_length; @@ -1662,6 +1678,27 @@ struct batadv_priv { #ifdef CONFIG_BATMAN_ADV_BLA +enum batadv_bla_backbone_gw_state { + /** + * @BATADV_BLA_BACKBONE_GW_STOPPED: backbone gw is being removed + * and it must not longer work on requests + */ + BATADV_BLA_BACKBONE_GW_STOPPED, + + /** + * @BATADV_BLA_BACKBONE_GW_UNSYNCED: backbone was detected out + * of sync and a request was send. No traffic is forwarded until the + * situation is resolved + */ + BATADV_BLA_BACKBONE_GW_UNSYNCED, + + /** + * @BATADV_BLA_BACKBONE_GW_SYNCED: backbone is consider to be in + * sync. traffic can be forwarded + */ + BATADV_BLA_BACKBONE_GW_SYNCED, +}; + /** * struct batadv_bla_backbone_gw - batman-adv gateway bridged into the LAN */ @@ -1687,16 +1724,12 @@ struct batadv_bla_backbone_gw { /** * @wait_periods: grace time for bridge forward delays and bla group * forming at bootup phase - no bcast traffic is formwared until it has - * elapsed + * elapsed. Must only be access with num_requests_lock. */ - atomic_t wait_periods; + u8 wait_periods; - /** - * @request_sent: if this bool is set to true we are out of sync with - * this backbone gateway - no bcast traffic is formwared until the - * situation was resolved - */ - atomic_t request_sent; + /** @state: sync state. Must only be access with num_requests_lock. */ + enum batadv_bla_backbone_gw_state state; /** @crc: crc16 checksum over all claims */ u16 crc; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 2f03b780b40d..960a19b3e26d 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -486,6 +486,8 @@ static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) int ret; local_skb = skb_clone(skb, GFP_ATOMIC); + if (!local_skb) + continue; BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p", netdev->name, diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 33d053d63407..1a6aa3f8d4d6 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -154,6 +154,7 @@ struct sock *bt_sock_alloc(struct net *net, struct socket *sock, sock_init_data(sock, sk); INIT_LIST_HEAD(&bt_sk(sk)->accept_q); + spin_lock_init(&bt_sk(sk)->accept_q_lock); sock_reset_flag(sk, SOCK_ZAPPED); @@ -214,6 +215,7 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) { const struct cred *old_cred; struct pid *old_pid; + struct bt_sock *par = bt_sk(parent); BT_DBG("parent %p, sk %p", parent, sk); @@ -224,9 +226,13 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) else lock_sock_nested(sk, SINGLE_DEPTH_NESTING); - list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q); bt_sk(sk)->parent = parent; + spin_lock_bh(&par->accept_q_lock); + list_add_tail(&bt_sk(sk)->accept_q, &par->accept_q); + sk_acceptq_added(parent); + spin_unlock_bh(&par->accept_q_lock); + /* Copy credentials from parent since for incoming connections the * socket is allocated by the kernel. */ @@ -244,8 +250,6 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) bh_unlock_sock(sk); else release_sock(sk); - - sk_acceptq_added(parent); } EXPORT_SYMBOL(bt_accept_enqueue); @@ -254,45 +258,72 @@ EXPORT_SYMBOL(bt_accept_enqueue); */ void bt_accept_unlink(struct sock *sk) { + struct sock *parent = bt_sk(sk)->parent; + BT_DBG("sk %p state %d", sk, sk->sk_state); + spin_lock_bh(&bt_sk(parent)->accept_q_lock); list_del_init(&bt_sk(sk)->accept_q); - sk_acceptq_removed(bt_sk(sk)->parent); + sk_acceptq_removed(parent); + spin_unlock_bh(&bt_sk(parent)->accept_q_lock); bt_sk(sk)->parent = NULL; sock_put(sk); } EXPORT_SYMBOL(bt_accept_unlink); +static struct sock *bt_accept_get(struct sock *parent, struct sock *sk) +{ + struct bt_sock *bt = bt_sk(parent); + struct sock *next = NULL; + + /* accept_q is modified from child teardown paths too, so take a + * temporary reference before dropping the queue lock. + */ + spin_lock_bh(&bt->accept_q_lock); + + if (sk) { + if (bt_sk(sk)->parent != parent) + goto out; + + if (!list_is_last(&bt_sk(sk)->accept_q, &bt->accept_q)) { + next = &list_next_entry(bt_sk(sk), accept_q)->sk; + sock_hold(next); + } + } else if (!list_empty(&bt->accept_q)) { + next = &list_first_entry(&bt->accept_q, + struct bt_sock, accept_q)->sk; + sock_hold(next); + } + +out: + spin_unlock_bh(&bt->accept_q_lock); + return next; +} + struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock) { - struct bt_sock *s, *n; - struct sock *sk; + struct sock *sk, *next; BT_DBG("parent %p", parent); restart: - list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) { - sk = (struct sock *)s; - + for (sk = bt_accept_get(parent, NULL); sk; sk = next) { /* Prevent early freeing of sk due to unlink and sock_kill */ - sock_hold(sk); lock_sock(sk); /* Check sk has not already been unlinked via * bt_accept_unlink() due to serialisation caused by sk locking */ - if (!bt_sk(sk)->parent) { + if (bt_sk(sk)->parent != parent) { BT_DBG("sk %p, already unlinked", sk); release_sock(sk); sock_put(sk); - /* Restart the loop as sk is no longer in the list - * and also avoid a potential infinite loop because - * list_for_each_entry_safe() is not thread safe. - */ goto restart; } + next = bt_accept_get(parent, sk); + /* sk is safely in the parent list so reduce reference count */ sock_put(sk); @@ -309,7 +340,19 @@ restart: if (newsock) sock_graft(sk, newsock); + /* Hand the caller a reference taken while sk is + * still locked. bt_accept_unlink() just dropped + * the accept-queue reference; without this hold a + * concurrent teardown (e.g. l2cap_conn_del() -> + * l2cap_sock_kill()) could free sk between + * release_sock() and the caller using it. Every + * caller drops this with sock_put() when done. + */ + sock_hold(sk); + release_sock(sk); + if (next) + sock_put(next); return sk; } @@ -518,18 +561,28 @@ EXPORT_SYMBOL(bt_sock_stream_recvmsg); static inline __poll_t bt_accept_poll(struct sock *parent) { - struct bt_sock *s, *n; + struct bt_sock *bt = bt_sk(parent); + struct bt_sock *s; struct sock *sk; + __poll_t mask = 0; + + spin_lock_bh(&bt->accept_q_lock); + list_for_each_entry(s, &bt->accept_q, accept_q) { + int state; - list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) { sk = (struct sock *)s; - if (sk->sk_state == BT_CONNECTED || - (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags) && - sk->sk_state == BT_CONNECT2)) - return EPOLLIN | EPOLLRDNORM; + state = READ_ONCE(sk->sk_state); + + if (state == BT_CONNECTED || + (test_bit(BT_SK_DEFER_SETUP, &bt->flags) && + state == BT_CONNECT2)) { + mask = EPOLLIN | EPOLLRDNORM; + break; + } } + spin_unlock_bh(&bt->accept_q_lock); - return 0; + return mask; } __poll_t bt_sock_poll(struct file *file, struct socket *sock, diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index d44987d4515c..5c5f53ff30e8 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -206,14 +206,11 @@ static int bnep_ctrl_set_mcfilter(struct bnep_session *s, u8 *data, int len) return 0; } -static int bnep_rx_control(struct bnep_session *s, void *data, int len) +static int bnep_rx_control_cmd(struct bnep_session *s, u8 cmd, void *data, + int len) { - u8 cmd = *(u8 *)data; int err = 0; - data++; - len--; - switch (cmd) { case BNEP_CMD_NOT_UNDERSTOOD: case BNEP_SETUP_CONN_RSP: @@ -254,6 +251,14 @@ static int bnep_rx_control(struct bnep_session *s, void *data, int len) return err; } +static int bnep_rx_control(struct bnep_session *s, void *data, int len) +{ + if (len < 1) + return -EILSEQ; + + return bnep_rx_control_cmd(s, *(u8 *)data, data + 1, len - 1); +} + static int bnep_rx_extension(struct bnep_session *s, struct sk_buff *skb) { struct bnep_ext_hdr *h; @@ -299,19 +304,26 @@ static int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb) { struct net_device *dev = s->dev; struct sk_buff *nskb; + u8 *data; u8 type, ctrl_type; dev->stats.rx_bytes += skb->len; - type = *(u8 *) skb->data; - skb_pull(skb, 1); - ctrl_type = *(u8 *)skb->data; + data = skb_pull_data(skb, sizeof(type)); + if (!data) + goto badframe; + type = *data; if ((type & BNEP_TYPE_MASK) >= sizeof(__bnep_rx_hlen)) goto badframe; if ((type & BNEP_TYPE_MASK) == BNEP_CONTROL) { - if (bnep_rx_control(s, skb->data, skb->len) < 0) { + data = skb_pull_data(skb, sizeof(ctrl_type)); + if (!data) + goto badframe; + ctrl_type = *data; + + if (bnep_rx_control_cmd(s, ctrl_type, skb->data, skb->len) < 0) { dev->stats.tx_errors++; kfree_skb(skb); return 0; @@ -324,15 +336,25 @@ static int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb) /* Verify and pull ctrl message since it's already processed */ switch (ctrl_type) { - case BNEP_SETUP_CONN_REQ: - /* Pull: ctrl type (1 b), len (1 b), data (len bytes) */ - if (!skb_pull(skb, 2 + *(u8 *)(skb->data + 1) * 2)) + case BNEP_SETUP_CONN_REQ: { + u8 uuid_size; + + /* Pull uuid_size and the dst/src service UUIDs. */ + data = skb_pull_data(skb, sizeof(uuid_size)); + if (!data) + goto badframe; + uuid_size = *data; + if (!skb_pull(skb, uuid_size + uuid_size)) goto badframe; break; + } case BNEP_FILTER_MULTI_ADDR_SET: case BNEP_FILTER_NET_TYPE_SET: - /* Pull: ctrl type (1 b), len (2 b), data (len bytes) */ - if (!skb_pull(skb, 3 + *(u16 *)(skb->data + 1) * 2)) + /* Pull: len (2 b), data (len bytes) */ + data = skb_pull_data(skb, sizeof(u16)); + if (!data) + goto badframe; + if (!skb_pull(skb, get_unaligned_be16(data))) goto badframe; break; default: @@ -638,8 +660,8 @@ int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock) goto failed; } - up_write(&bnep_session_sem); strcpy(req->device, dev->name); + up_write(&bnep_session_sem); return 0; failed: diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 3a0592599086..54eabaa46960 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -480,40 +480,107 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle) return hci_setup_sync_conn(conn, handle); } -u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, - u16 to_multiplier) +struct le_conn_update_data { + struct hci_conn *conn; + u16 min; + u16 max; + u16 latency; + u16 to_multiplier; +}; + +static int le_conn_update_sync(struct hci_dev *hdev, void *data) { - struct hci_dev *hdev = conn->hdev; + struct le_conn_update_data *d = data; + struct hci_conn *conn = d->conn; struct hci_conn_params *params; struct hci_cp_le_conn_update cp; + u16 timeout; + u8 store_hint; + int err; + /* Verify connection is still alive and read conn fields under + * the same lock to prevent a concurrent disconnect from freeing + * or reusing the connection while we build the HCI command. + */ hci_dev_lock(hdev); - params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); - if (params) { - params->conn_min_interval = min; - params->conn_max_interval = max; - params->conn_latency = latency; - params->supervision_timeout = to_multiplier; + if (!hci_conn_valid(hdev, conn)) { + hci_dev_unlock(hdev); + return -ECANCELED; } - hci_dev_unlock(hdev); - memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); - cp.conn_interval_min = cpu_to_le16(min); - cp.conn_interval_max = cpu_to_le16(max); - cp.conn_latency = cpu_to_le16(latency); - cp.supervision_timeout = cpu_to_le16(to_multiplier); + cp.conn_interval_min = cpu_to_le16(d->min); + cp.conn_interval_max = cpu_to_le16(d->max); + cp.conn_latency = cpu_to_le16(d->latency); + cp.supervision_timeout = cpu_to_le16(d->to_multiplier); cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); + timeout = conn->conn_timeout; - hci_send_cmd(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp); + hci_dev_unlock(hdev); - if (params) - return 0x01; + err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CONN_UPDATE, + sizeof(cp), &cp, + HCI_EV_LE_CONN_UPDATE_COMPLETE, + timeout, NULL); + if (err) + return err; - return 0x00; + /* Update stored connection parameters after the controller has + * confirmed the update via the LE Connection Update Complete event. + */ + hci_dev_lock(hdev); + + params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); + if (params) { + params->conn_min_interval = d->min; + params->conn_max_interval = d->max; + params->conn_latency = d->latency; + params->supervision_timeout = d->to_multiplier; + store_hint = 0x01; + } else { + store_hint = 0x00; + } + + hci_dev_unlock(hdev); + + mgmt_new_conn_param(hdev, &conn->dst, conn->dst_type, store_hint, + d->min, d->max, d->latency, d->to_multiplier); + + return 0; +} + +static void le_conn_update_complete(struct hci_dev *hdev, void *data, int err) +{ + struct le_conn_update_data *d = data; + + hci_conn_put(d->conn); + kfree(d); +} + +void hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, + u16 to_multiplier) +{ + struct le_conn_update_data *d; + + d = kzalloc_obj(*d); + if (!d) + return; + + hci_conn_get(conn); + d->conn = conn; + d->min = min; + d->max = max; + d->latency = latency; + d->to_multiplier = to_multiplier; + + if (hci_cmd_sync_queue(conn->hdev, le_conn_update_sync, d, + le_conn_update_complete) < 0) { + hci_conn_put(conn); + kfree(d); + } } void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, @@ -803,8 +870,10 @@ static int hci_le_big_terminate(struct hci_dev *hdev, struct hci_conn *conn) d->big_sync_term = true; } - if (!d->pa_sync_term && !d->big_sync_term) + if (!d->pa_sync_term && !d->big_sync_term) { + kfree(d); return 0; + } ret = hci_cmd_sync_queue(hdev, big_terminate_sync, d, terminate_big_destroy); @@ -2130,6 +2199,9 @@ static int create_big_sync(struct hci_dev *hdev, void *data) u32 flags = 0; int err; + if (!hci_conn_valid(hdev, conn)) + return -ECANCELED; + if (qos->bcast.out.phys == BIT(1)) flags |= MGMT_ADV_FLAG_SEC_2M; @@ -2204,11 +2276,24 @@ static void create_big_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "conn %p", conn); + if (err == -ECANCELED) + goto done; + + hci_dev_lock(hdev); + + if (!hci_conn_valid(hdev, conn)) + goto unlock; + if (err) { bt_dev_err(hdev, "Unable to create BIG: %d", err); hci_connect_cfm(conn, err); hci_conn_del(conn); } + +unlock: + hci_dev_unlock(hdev); +done: + hci_conn_put(conn); } struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, @@ -2336,10 +2421,11 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, BT_BOUND, &data); /* Queue start periodic advertising and create BIG */ - err = hci_cmd_sync_queue(hdev, create_big_sync, conn, + err = hci_cmd_sync_queue(hdev, create_big_sync, hci_conn_get(conn), create_big_complete); if (err < 0) { hci_conn_drop(conn); + hci_conn_put(conn); return ERR_PTR(err); } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index c46c1236ebfa..28d7929dc593 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -539,46 +539,9 @@ static int hci_dev_do_reset(struct hci_dev *hdev) hci_req_sync_lock(hdev); - /* Drop queues */ - skb_queue_purge(&hdev->rx_q); - skb_queue_purge(&hdev->cmd_q); - - /* Cancel these to avoid queueing non-chained pending work */ - hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); - /* Wait for - * - * if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) - * queue_delayed_work(&hdev->{cmd,ncmd}_timer) - * - * inside RCU section to see the flag or complete scheduling. - */ - synchronize_rcu(); - /* Explicitly cancel works in case scheduled after setting the flag. */ - cancel_delayed_work(&hdev->cmd_timer); - cancel_delayed_work(&hdev->ncmd_timer); - - /* Avoid potential lockdep warnings from the *_flush() calls by - * ensuring the workqueue is empty up front. - */ - drain_workqueue(hdev->workqueue); - - hci_dev_lock(hdev); - hci_inquiry_cache_flush(hdev); - hci_conn_hash_flush(hdev); - hci_dev_unlock(hdev); - - if (hdev->flush) - hdev->flush(hdev); - - hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); - - atomic_set(&hdev->cmd_cnt, 1); - hdev->acl_cnt = 0; - hdev->sco_cnt = 0; - hdev->le_cnt = 0; - hdev->iso_cnt = 0; - - ret = hci_reset_sync(hdev); + ret = hci_dev_close_sync(hdev); + if (!ret) + ret = hci_dev_open_sync(hdev); hci_req_sync_unlock(hdev); return ret; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index b2ee6b6a0f56..eea2f810aafa 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -7118,9 +7118,29 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, continue; } + if (ev->num_bis <= i) { + bt_dev_err(hdev, + "Not enough BIS handles for BIG 0x%2.2x", + ev->handle); + ev->status = HCI_ERROR_UNSPECIFIED; + hci_connect_cfm(conn, ev->status); + hci_conn_del(conn); + continue; + } + if (hci_conn_set_handle(conn, - __le16_to_cpu(ev->bis_handle[i++]))) + __le16_to_cpu(ev->bis_handle[i++]))) { + bt_dev_err(hdev, + "Failed to set BIS handle for BIG 0x%2.2x", + ev->handle); + /* Force error so BIG gets terminated as not all BIS + * could be connected. + */ + ev->status = HCI_ERROR_UNSPECIFIED; + hci_connect_cfm(conn, ev->status); + hci_conn_del(conn); continue; + } conn->state = BT_CONNECTED; set_bit(HCI_CONN_BIG_CREATED, &conn->flags); @@ -7129,7 +7149,10 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, hci_iso_setup_path(conn); } - if (!ev->status && !i) + /* If there is an unexpected error or if no BISes have been connected + * for the BIG, terminate it. + */ + if (ev->status == HCI_ERROR_UNSPECIFIED || (!ev->status && !i)) /* If no BISes have been connected for the BIG, * terminate. This is in case all bound connections * have been closed before the BIG creation @@ -7168,7 +7191,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, clear_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags); conn->num_bis = 0; - memset(conn->bis, 0, sizeof(conn->num_bis)); + memset(conn->bis, 0, sizeof(conn->bis)); for (i = 0; i < ev->num_bis; i++) { u16 handle = le16_to_cpu(ev->bis[i]); diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index fd3aacdea512..df23245d6ccd 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1725,6 +1725,11 @@ static int hci_adv_bcast_annoucement(struct hci_dev *hdev, struct adv_info *adv) /* Generate Broadcast ID */ get_random_bytes(bid, sizeof(bid)); len = eir_append_service_data(ad, 0, 0x1852, bid, sizeof(bid)); + if (adv->adv_data_len > sizeof(ad) - len) { + bt_dev_err(hdev, "No room for Broadcast Announcement"); + return -EINVAL; + } + memcpy(ad + len, adv->adv_data, adv->adv_data_len); hci_set_adv_instance_data(hdev, adv->instance, len + adv->adv_data_len, ad, 0, NULL); @@ -4438,6 +4443,9 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) events[4] |= 0x02; /* LE BIG Info Advertising Report */ } + if (ll_ext_feature_capable(hdev)) + events[5] |= BIT(2); + if (le_cs_capable(hdev)) { /* Channel Sounding events */ events[5] |= 0x08; /* LE CS Read Remote Supported Cap Complete event */ @@ -5298,6 +5306,12 @@ int hci_dev_close_sync(struct hci_dev *hdev) bt_dev_dbg(hdev, ""); + /* Set HCI_DRAIN_WORKQUEUE flag to prevent queuing work during + * reset/close. See hci_cmd_work() and handle_cmd_cnt_and_timer(). + */ + hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); + synchronize_rcu(); + if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { disable_delayed_work(&hdev->power_off); disable_delayed_work(&hdev->ncmd_timer); @@ -5321,6 +5335,7 @@ int hci_dev_close_sync(struct hci_dev *hdev) if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { cancel_delayed_work_sync(&hdev->cmd_timer); + hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); return err; } @@ -5383,6 +5398,10 @@ int hci_dev_close_sync(struct hci_dev *hdev) /* Reset device */ skb_queue_purge(&hdev->cmd_q); atomic_set(&hdev->cmd_cnt, 1); + hdev->acl_cnt = 0; + hdev->sco_cnt = 0; + hdev->le_cnt = 0; + hdev->iso_cnt = 0; if (hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE) && !auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { set_bit(HCI_INIT, &hdev->flags); @@ -5420,6 +5439,7 @@ int hci_dev_close_sync(struct hci_dev *hdev) /* Clear flags */ hdev->flags &= BIT(HCI_RAW); hci_dev_clear_volatile_flags(hdev); + hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); memset(hdev->eir, 0, sizeof(hdev->eir)); memset(hdev->dev_class, 0, sizeof(hdev->dev_class)); @@ -6696,6 +6716,7 @@ int hci_le_create_cis_sync(struct hci_dev *hdev) DEFINE_FLEX(struct hci_cp_le_create_cis, cmd, cis, num_cis, 0x1f); size_t aux_num_cis = 0; struct hci_conn *conn; + u16 timeout = 0; u8 cig = BT_ISO_QOS_CIG_UNSET; /* The spec allows only one pending LE Create CIS command at a time. If @@ -6766,6 +6787,7 @@ int hci_le_create_cis_sync(struct hci_dev *hdev) set_bit(HCI_CONN_CREATE_CIS, &conn->flags); cis->acl_handle = cpu_to_le16(conn->parent->handle); cis->cis_handle = cpu_to_le16(conn->handle); + timeout = conn->conn_timeout; aux_num_cis++; if (aux_num_cis >= cmd->num_cis) @@ -6785,7 +6807,7 @@ done: return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CIS, struct_size(cmd, cis, cmd->num_cis), cmd, HCI_EVT_LE_CIS_ESTABLISHED, - conn->conn_timeout, NULL); + timeout, NULL); } int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle) @@ -7413,9 +7435,6 @@ static int hci_le_read_all_remote_features_sync(struct hci_dev *hdev, sizeof(cp), &cp, HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE, HCI_CMD_TIMEOUT, NULL); - - return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_ALL_REMOTE_FEATURES, - sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_read_remote_features_sync(struct hci_dev *hdev, void *data) diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 041ce9adc378..8957ce7c21b7 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -83,10 +83,12 @@ static void bt_host_release(struct device *dev) { struct hci_dev *hdev = to_hci_dev(dev); - if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) + if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { hci_release_dev(hdev); - else + } else { + cleanup_srcu_struct(&hdev->srcu); kfree(hdev); + } module_put(THIS_MODULE); } diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 7bcf8c5ceaee..70344bd3248a 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -179,12 +179,21 @@ static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb) { struct input_dev *dev = session->input; unsigned char *keys = session->keys; - unsigned char *udata = skb->data + 1; - signed char *sdata = skb->data + 1; - int i, size = skb->len - 1; + unsigned char *udata; + signed char *sdata; + u8 *hdr; + int i; + + hdr = skb_pull_data(skb, 1); + if (!hdr) + return; - switch (skb->data[0]) { + switch (*hdr) { case 0x01: /* Keyboard report */ + udata = skb_pull_data(skb, 8); + if (!udata) + break; + for (i = 0; i < 8; i++) input_report_key(dev, hidp_keycode[i + 224], (udata[0] >> i) & 1); @@ -213,6 +222,10 @@ static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb) break; case 0x02: /* Mouse report */ + sdata = skb_pull_data(skb, 3); + if (!sdata) + break; + input_report_key(dev, BTN_LEFT, sdata[0] & 0x01); input_report_key(dev, BTN_RIGHT, sdata[0] & 0x02); input_report_key(dev, BTN_MIDDLE, sdata[0] & 0x04); @@ -222,7 +235,7 @@ static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb) input_report_rel(dev, REL_X, sdata[1]); input_report_rel(dev, REL_Y, sdata[2]); - if (size > 3) + if (skb->len > 0) input_report_rel(dev, REL_WHEEL, sdata[3]); break; } @@ -1036,6 +1049,28 @@ static struct hidp_session *hidp_session_find(const bdaddr_t *bdaddr) } /* + * Consume session->conn: clear the member under hidp_session_sem, then + * l2cap_unregister_user() and l2cap_conn_put() the snapshot outside the + * sem. At most one caller wins; later callers see NULL and skip. The + * reference is the one hidp_session_new() took via l2cap_conn_get(). + */ +static void hidp_session_unregister_conn(struct hidp_session *session) +{ + struct l2cap_conn *conn; + + down_write(&hidp_session_sem); + conn = session->conn; + if (conn) + session->conn = NULL; + up_write(&hidp_session_sem); + + if (conn) { + l2cap_unregister_user(conn, &session->user); + l2cap_conn_put(conn); + } +} + +/* * Start session synchronously * This starts a session thread and waits until initialization * is done or returns an error if it couldn't be started. @@ -1311,8 +1346,7 @@ static int hidp_session_thread(void *arg) * Instead, this call has the same semantics as if user-space tried to * delete the session. */ - if (session->conn) - l2cap_unregister_user(session->conn, &session->user); + hidp_session_unregister_conn(session); hidp_session_put(session); @@ -1418,7 +1452,7 @@ int hidp_connection_del(struct hidp_conndel_req *req) HIDP_CTRL_VIRTUAL_CABLE_UNPLUG, NULL, 0); else - l2cap_unregister_user(session->conn, &session->user); + hidp_session_unregister_conn(session); hidp_session_put(session); diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index be145e2736b7..3abd8111dda8 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -337,16 +337,25 @@ static int iso_connect_bis(struct sock *sk) struct iso_conn *conn; struct hci_conn *hcon; struct hci_dev *hdev; + bdaddr_t src, dst; + u8 src_type, bc_sid; int err; - BT_DBG("%pMR (SID 0x%2.2x)", &iso_pi(sk)->src, iso_pi(sk)->bc_sid); + lock_sock(sk); + bacpy(&src, &iso_pi(sk)->src); + bacpy(&dst, &iso_pi(sk)->dst); + src_type = iso_pi(sk)->src_type; + bc_sid = iso_pi(sk)->bc_sid; + release_sock(sk); - hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, - iso_pi(sk)->src_type); + BT_DBG("%pMR (SID 0x%2.2x)", &src, bc_sid); + + hdev = hci_get_route(&dst, &src, src_type); if (!hdev) return -EHOSTUNREACH; hci_dev_lock(hdev); + lock_sock(sk); if (!bis_capable(hdev)) { err = -EOPNOTSUPP; @@ -399,13 +408,9 @@ static int iso_connect_bis(struct sock *sk) goto unlock; } - lock_sock(sk); - err = iso_chan_add(conn, sk, NULL); - if (err) { - release_sock(sk); + if (err) goto unlock; - } /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); @@ -421,9 +426,8 @@ static int iso_connect_bis(struct sock *sk) iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo)); } - release_sock(sk); - unlock: + release_sock(sk); hci_dev_unlock(hdev); hci_dev_put(hdev); return err; @@ -434,16 +438,24 @@ static int iso_connect_cis(struct sock *sk) struct iso_conn *conn; struct hci_conn *hcon; struct hci_dev *hdev; + bdaddr_t src, dst; + u8 src_type; int err; - BT_DBG("%pMR -> %pMR", &iso_pi(sk)->src, &iso_pi(sk)->dst); + lock_sock(sk); + bacpy(&src, &iso_pi(sk)->src); + bacpy(&dst, &iso_pi(sk)->dst); + src_type = iso_pi(sk)->src_type; + release_sock(sk); + + BT_DBG("%pMR -> %pMR", &src, &dst); - hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, - iso_pi(sk)->src_type); + hdev = hci_get_route(&dst, &src, src_type); if (!hdev) return -EHOSTUNREACH; hci_dev_lock(hdev); + lock_sock(sk); if (!cis_central_capable(hdev)) { err = -EOPNOTSUPP; @@ -498,13 +510,9 @@ static int iso_connect_cis(struct sock *sk) goto unlock; } - lock_sock(sk); - err = iso_chan_add(conn, sk, NULL); - if (err) { - release_sock(sk); + if (err) goto unlock; - } /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); @@ -520,9 +528,8 @@ static int iso_connect_cis(struct sock *sk) iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo)); } - release_sock(sk); - unlock: + release_sock(sk); hci_dev_unlock(hdev); hci_dev_put(hdev); return err; @@ -572,7 +579,7 @@ static void iso_recv_frame(struct iso_conn *conn, struct sk_buff *skb) struct sock *sk; iso_conn_lock(conn); - sk = conn->sk; + sk = iso_sock_hold(conn); iso_conn_unlock(conn); if (!sk) @@ -581,11 +588,15 @@ static void iso_recv_frame(struct iso_conn *conn, struct sk_buff *skb) BT_DBG("sk %p len %d", sk, skb->len); if (sk->sk_state != BT_CONNECTED) - goto drop; + goto drop_put; - if (!sock_queue_rcv_skb(sk, skb)) + if (!sock_queue_rcv_skb(sk, skb)) { + sock_put(sk); return; + } +drop_put: + sock_put(sk); drop: kfree_skb(skb); } @@ -759,6 +770,8 @@ static void iso_sock_cleanup_listen(struct sock *parent) while ((sk = bt_accept_dequeue(parent, NULL))) { iso_sock_close(sk); iso_sock_kill(sk); + /* Drop the reference handed back by bt_accept_dequeue(). */ + sock_put(sk); } /* If listening socket has a hcon, properly disconnect it */ @@ -866,8 +879,8 @@ static void __iso_sock_close(struct sock *sk) /* Must be called on unlocked socket. */ static void iso_sock_close(struct sock *sk) { - iso_sock_clear_timer(sk); lock_sock(sk); + iso_sock_clear_timer(sk); __iso_sock_close(sk); release_sock(sk); iso_sock_kill(sk); @@ -1084,7 +1097,7 @@ static int iso_sock_rebind_bc(struct sock *sk, struct sockaddr_iso *sa, * ordering. */ release_sock(sk); - hci_dev_lock(bis->hdev); + hci_dev_lock(hdev); lock_sock(sk); if (!iso_pi(sk)->conn || iso_pi(sk)->conn->hcon != bis) { @@ -1193,7 +1206,7 @@ static int iso_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, release_sock(sk); - if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) + if (bacmp(&sa->iso_bdaddr, BDADDR_ANY)) err = iso_connect_cis(sk); else err = iso_connect_bis(sk); @@ -1214,18 +1227,25 @@ static int iso_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, static int iso_listen_bis(struct sock *sk) { - struct hci_dev *hdev; - int err = 0; struct iso_conn *conn; struct hci_conn *hcon; + struct hci_dev *hdev; + bdaddr_t src, dst; + u8 src_type, bc_sid; + int err = 0; + + lock_sock(sk); + bacpy(&src, &iso_pi(sk)->src); + bacpy(&dst, &iso_pi(sk)->dst); + src_type = iso_pi(sk)->src_type; + bc_sid = iso_pi(sk)->bc_sid; + release_sock(sk); - BT_DBG("%pMR -> %pMR (SID 0x%2.2x)", &iso_pi(sk)->src, - &iso_pi(sk)->dst, iso_pi(sk)->bc_sid); + BT_DBG("%pMR -> %pMR (SID 0x%2.2x)", &src, &dst, bc_sid); write_lock(&iso_sk_list.lock); - if (__iso_get_sock_listen_by_sid(&iso_pi(sk)->src, &iso_pi(sk)->dst, - iso_pi(sk)->bc_sid)) + if (__iso_get_sock_listen_by_sid(&src, &dst, bc_sid)) err = -EADDRINUSE; write_unlock(&iso_sk_list.lock); @@ -1233,8 +1253,7 @@ static int iso_listen_bis(struct sock *sk) if (err) return err; - hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, - iso_pi(sk)->src_type); + hdev = hci_get_route(&dst, &src, src_type); if (!hdev) return -EHOSTUNREACH; @@ -1364,8 +1383,13 @@ static int iso_sock_accept(struct socket *sock, struct socket *newsock, } ch = bt_accept_dequeue(sk, newsock); - if (ch) + if (ch) { + /* Drop the bridging ref from bt_accept_dequeue(); + * the grafted socket keeps ch alive from here. + */ + sock_put(ch); break; + } if (!timeo) { err = -EAGAIN; @@ -1565,9 +1589,16 @@ static void iso_conn_big_sync(struct sock *sk) { int err; struct hci_dev *hdev; + bdaddr_t src, dst; + u8 src_type; - hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, - iso_pi(sk)->src_type); + lock_sock(sk); + bacpy(&src, &iso_pi(sk)->src); + bacpy(&dst, &iso_pi(sk)->dst); + src_type = iso_pi(sk)->src_type; + release_sock(sk); + + hdev = hci_get_route(&dst, &src, src_type); if (!hdev) return; @@ -1592,6 +1623,7 @@ static void iso_conn_big_sync(struct sock *sk) release_sock(sk); hci_dev_unlock(hdev); + hci_dev_put(hdev); } static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg, @@ -2256,8 +2288,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sid, ev1); if (sk && !ev1->status) { + lock_sock(sk); iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle); iso_pi(sk)->bc_sid = ev1->sid; + release_sock(sk); } goto done; @@ -2268,8 +2302,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sid_past, ev1a); if (sk && !ev1a->status) { + lock_sock(sk); iso_pi(sk)->sync_handle = le16_to_cpu(ev1a->sync_handle); iso_pi(sk)->bc_sid = ev1a->sid; + release_sock(sk); } goto done; @@ -2296,27 +2332,35 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) ev2); if (sk) { - int err; - struct hci_conn *hcon = iso_pi(sk)->conn->hcon; + int err = 0; + bool big_sync; + struct hci_conn *hcon; + lock_sock(sk); + + hcon = iso_pi(sk)->conn->hcon; iso_pi(sk)->qos.bcast.encryption = ev2->encryption; if (ev2->num_bis < iso_pi(sk)->bc_num_bis) iso_pi(sk)->bc_num_bis = ev2->num_bis; - if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && - !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { + big_sync = !test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && + !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags); + + if (big_sync) err = hci_conn_big_create_sync(hdev, hcon, &iso_pi(sk)->qos, iso_pi(sk)->sync_handle, iso_pi(sk)->bc_num_bis, iso_pi(sk)->bc_bis); - if (err) { - bt_dev_err(hdev, "hci_le_big_create_sync: %d", - err); - sock_put(sk); - sk = NULL; - } + + release_sock(sk); + + if (big_sync && err) { + bt_dev_err(hdev, "hci_le_big_create_sync: %d", + err); + sock_put(sk); + sk = NULL; } } @@ -2370,8 +2414,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) if (!base || base_len > BASE_MAX_LENGTH) goto done; + lock_sock(sk); memcpy(iso_pi(sk)->base, base, base_len); iso_pi(sk)->base_len = base_len; + release_sock(sk); } else { /* This is a PA data fragment. Keep pa_data_len set to 0 * until all data has been reassembled. @@ -2587,6 +2633,11 @@ int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, u16 flags) break; case ISO_END: + if (!conn->rx_len) { + BT_ERR("Unexpected end frame (len %d)", skb->len); + goto drop; + } + skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len -= skb->len; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 77dec104a9c3..c4ccfbda9d78 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -411,8 +411,10 @@ static void l2cap_chan_timeout(struct work_struct *work) BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); - if (!conn) + if (!conn) { + l2cap_chan_put(chan); return; + } mutex_lock(&conn->lock); /* __set_chan_timer() calls l2cap_chan_hold(chan) while scheduling @@ -4706,16 +4708,8 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, l2cap_send_cmd(conn, cmd->ident, L2CAP_CONN_PARAM_UPDATE_RSP, sizeof(rsp), &rsp); - if (!err) { - u8 store_hint; - - store_hint = hci_le_conn_update(hcon, min, max, latency, - to_multiplier); - mgmt_new_conn_param(hcon->hdev, &hcon->dst, hcon->dst_type, - store_hint, min, max, latency, - to_multiplier); - - } + if (!err) + hci_le_conn_update(hcon, min, max, latency, to_multiplier); return 0; } @@ -5268,6 +5262,7 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, cmd_len -= sizeof(*rsp); list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { + struct l2cap_chan *orig; u16 dcid; if (chan->ident != cmd->ident || @@ -5289,8 +5284,10 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, BT_DBG("dcid[%d] 0x%4.4x", i, dcid); + orig = __l2cap_get_chan_by_dcid(conn, dcid); + /* Check if dcid is already in use */ - if (dcid && __l2cap_get_chan_by_dcid(conn, dcid)) { + if (dcid && orig) { /* If a device receives a * L2CAP_CREDIT_BASED_CONNECTION_RSP packet with an * already-assigned Destination CID, then both the @@ -5299,10 +5296,24 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, */ l2cap_chan_del(chan, ECONNREFUSED); l2cap_chan_unlock(chan); - chan = __l2cap_get_chan_by_dcid(conn, dcid); - l2cap_chan_lock(chan); - l2cap_chan_del(chan, ECONNRESET); - l2cap_chan_unlock(chan); + + /* Check that the dcid channel mode is + * L2CAP_MODE_EXT_FLOWCTL since this procedure is only + * valid for that mode and shouldn't disconnect a dcid + * in other modes. + */ + if (orig->mode == L2CAP_MODE_EXT_FLOWCTL) { + l2cap_chan_lock(orig); + /* Disconnect the original channel as it may be + * considered connected since dcid has already + * been assigned; don't call l2cap_chan_close + * directly since that could lead to + * l2cap_chan_del and then removing the channel + * from the list while we're iterating over it. + */ + __set_chan_timer(orig, 0); + l2cap_chan_unlock(orig); + } continue; } @@ -5428,7 +5439,7 @@ static inline int l2cap_ecred_reconf_req(struct l2cap_conn *conn, * configured, the MPS field may be less than the current MPS * of that channel. */ - if (chan[i]->remote_mps >= mps && i) { + if (chan[i]->remote_mps > mps && num_scid > 1) { BT_ERR("chan %p decreased MPS %u -> %u", chan[i], chan[i]->remote_mps, mps); result = L2CAP_RECONF_INVALID_MPS; @@ -5466,14 +5477,20 @@ static inline int l2cap_ecred_reconf_rsp(struct l2cap_conn *conn, BT_DBG("result 0x%4.4x", result); - if (!result) + if (!result) { + list_for_each_entry(chan, &conn->chan_l, list) { + if (chan->ident == cmd->ident) + chan->ident = 0; + } return 0; + } list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { if (chan->ident != cmd->ident) continue; - l2cap_chan_hold(chan); + if (!l2cap_chan_hold_unless_zero(chan)) + continue; l2cap_chan_lock(chan); l2cap_chan_del(chan, ECONNRESET); @@ -5626,6 +5643,15 @@ static inline void l2cap_sig_send_rej(struct l2cap_conn *conn, u16 ident) l2cap_send_cmd(conn, ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej); } +static inline void l2cap_sig_send_mtu_rej(struct l2cap_conn *conn, u8 ident) +{ + struct l2cap_cmd_rej_mtu rej; + + rej.reason = cpu_to_le16(L2CAP_REJ_MTU_EXCEEDED); + rej.max_mtu = cpu_to_le16(L2CAP_SIG_MTU); + l2cap_send_cmd(conn, ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej); +} + static inline void l2cap_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) { @@ -5638,6 +5664,43 @@ static inline void l2cap_sig_channel(struct l2cap_conn *conn, if (hcon->type != ACL_LINK) goto drop; + /* + * Bluetooth Core v5.4, Vol 3, Part A, Section 4: the BR/EDR + * signaling channel has a fixed signaling MTU (MTUsig) whose + * minimum and default is 48 octets. Section 4.1 says that on + * an MTUExceeded command reject the identifier "shall match + * the first request command in the L2CAP packet" and that + * packets containing only response commands "shall be + * silently discarded". + * + * Linux intentionally deviates from that prescription: + * + * 1. Silently discarding desynchronizes the peer. The + * remote stack never learns its responses were dropped, + * so any state machine waiting on a paired response + * stalls until its own timer fires. + * + * 2. Locating "the first request command" requires walking + * command headers past MTUsig, i.e. processing bytes + * from a packet we have already decided is too large to + * process. + * + * Reject every over-MTUsig signaling packet with one + * L2CAP_REJ_MTU_EXCEEDED command reject. The reject's + * reason field is what tells the peer that the whole packet + * was discarded; the identifier value is informational, so + * we use the identifier from the first command header, a + * single fixed-offset byte read. + */ + if (skb->len > L2CAP_SIG_MTU) { + u8 ident = skb->data[1]; + + BT_DBG("signaling packet exceeds MTU: %u > %u", + skb->len, L2CAP_SIG_MTU); + l2cap_sig_send_mtu_rej(conn, ident); + goto drop; + } + while (skb->len >= L2CAP_CMD_HDR_SIZE) { u16 len; @@ -7282,7 +7345,7 @@ static void l2cap_ecred_reconfigure(struct l2cap_chan *chan) chan->ident = l2cap_get_ident(conn); l2cap_send_cmd(conn, chan->ident, L2CAP_ECRED_RECONF_REQ, - sizeof(pdu), &pdu); + struct_size(pdu, scid, 1), pdu); } int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 71e8c1b45bce..c138aa4ae266 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -349,8 +349,13 @@ static int l2cap_sock_accept(struct socket *sock, struct socket *newsock, } nsk = bt_accept_dequeue(sk, newsock); - if (nsk) + if (nsk) { + /* Drop the bridging ref from bt_accept_dequeue(); + * the grafted socket keeps nsk alive from here. + */ + sock_put(nsk); break; + } if (!timeo) { err = -EAGAIN; @@ -1475,22 +1480,56 @@ static void l2cap_sock_cleanup_listen(struct sock *parent) BT_DBG("parent %p state %s", parent, state_to_string(parent->sk_state)); - /* Close not yet accepted channels */ + /* Close not yet accepted channels. + * + * bt_accept_dequeue() now returns sk with an extra reference held + * (taken while sk was still locked) so a concurrent l2cap_conn_del() + * -> l2cap_sock_kill() cannot free sk under us. + * + * cleanup_listen() runs under the parent sk lock, so unlike + * l2cap_sock_shutdown() we must NOT take conn->lock here: that would + * establish sk_lock -> conn->lock and invert the established + * conn->lock -> chan->lock -> sk_lock order (lockdep deadlock). + * + * Instead, briefly take the child sk lock to fetch and pin its chan. + * l2cap_conn_del() reaches the chan free only via + * l2cap_chan_del() -> l2cap_sock_teardown_cb(), which itself takes + * the child sk lock; holding it across l2cap_chan_hold_unless_zero() + * therefore guarantees the chan cannot be freed while we read and + * pin it (hold_unless_zero() additionally skips a chan already past + * its last reference). We then drop the sk lock before taking + * chan->lock, so sk and chan locks are never held together. + * + * Since we cannot call l2cap_chan_close() without conn->lock, + * schedule l2cap_chan_timeout to close the channel; it already + * acquires conn->lock -> chan->lock in the correct order. + */ while ((sk = bt_accept_dequeue(parent, NULL))) { - struct l2cap_chan *chan = l2cap_pi(sk)->chan; + struct l2cap_chan *chan; + + lock_sock_nested(sk, L2CAP_NESTING_NORMAL); + chan = l2cap_chan_hold_unless_zero(l2cap_pi(sk)->chan); + release_sock(sk); + if (!chan) { + /* l2cap_conn_del() already tearing this child down */ + sock_put(sk); + continue; + } BT_DBG("child chan %p state %s", chan, state_to_string(chan->state)); - l2cap_chan_hold(chan); l2cap_chan_lock(chan); - - __clear_chan_timer(chan); - l2cap_chan_close(chan, ECONNRESET); - l2cap_sock_kill(sk); - + /* Since we cannot call l2cap_chan_close() without + * conn->lock, schedule its timer to trigger the close + * and cleanup of this channel. + */ + if (chan->conn) + __set_chan_timer(chan, 0); l2cap_chan_unlock(chan); + l2cap_chan_put(chan); + sock_put(sk); } } @@ -1498,6 +1537,9 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan) { struct sock *sk, *parent = chan->data; + if (!parent) + return NULL; + lock_sock(parent); /* Check for backlog size */ @@ -1657,6 +1699,9 @@ static void l2cap_sock_state_change_cb(struct l2cap_chan *chan, int state, { struct sock *sk = chan->data; + if (!sk) + return; + sk->sk_state = state; if (err) @@ -1758,6 +1803,9 @@ static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; + if (!sk) + return 0; + return READ_ONCE(sk->sk_sndtimeo); } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index b05bb380e5f8..f4aa814a0397 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -8638,6 +8638,12 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, if (!cur_len) continue; + /* If the current field length would exceed the total data + * length, then it's invalid. + */ + if (i + cur_len >= len) + return false; + if (data[i + 1] == EIR_FLAGS && (!is_adv_data || flags_managed(adv_flags))) return false; @@ -8654,12 +8660,6 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, if (data[i + 1] == EIR_APPEARANCE && appearance_managed(adv_flags)) return false; - - /* If the current field length would exceed the total data - * length, then it's invalid. - */ - if (i + cur_len >= len) - return false; } return true; @@ -9110,9 +9110,16 @@ static int add_ext_adv_data(struct sock *sk, struct hci_dev *hdev, void *data, struct adv_info *adv_instance; int err = 0; struct mgmt_pending_cmd *cmd; + u16 expected_len; BT_DBG("%s", hdev->name); + expected_len = struct_size(cp, data, cp->adv_data_len + + cp->scan_rsp_len); + if (expected_len > data_len) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA, + MGMT_STATUS_INVALID_PARAMS); + hci_dev_lock(hdev); adv_instance = hci_find_adv_instance(hdev, cp->instance); diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 611a9a94151e..364b9381c2dc 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -1431,10 +1431,15 @@ static int rfcomm_apply_pn(struct rfcomm_dlc *d, int cr, struct rfcomm_pn *pn) static int rfcomm_recv_pn(struct rfcomm_session *s, int cr, struct sk_buff *skb) { - struct rfcomm_pn *pn = (void *) skb->data; + struct rfcomm_pn *pn; struct rfcomm_dlc *d; - u8 dlci = pn->dlci; + u8 dlci; + + pn = skb_pull_data(skb, sizeof(*pn)); + if (!pn) + return -EILSEQ; + dlci = pn->dlci; BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); if (!dlci) @@ -1483,8 +1488,8 @@ static int rfcomm_recv_pn(struct rfcomm_session *s, int cr, struct sk_buff *skb) static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_buff *skb) { - struct rfcomm_rpn *rpn = (void *) skb->data; - u8 dlci = __get_dlci(rpn->dlci); + struct rfcomm_rpn *rpn; + u8 dlci; u8 bit_rate = 0; u8 data_bits = 0; @@ -1495,15 +1500,16 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ u8 xoff_char = 0; u16 rpn_mask = RFCOMM_RPN_PM_ALL; - BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x", - dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl, - rpn->xon_char, rpn->xoff_char, rpn->param_mask); + if (len == 1) { + rpn = skb_pull_data(skb, 1); + if (!rpn) + return -EILSEQ; - if (!cr) - return 0; + dlci = __get_dlci(rpn->dlci); + + if (!cr) + return 0; - if (len == 1) { - /* This is a request, return default (according to ETSI TS 07.10) settings */ bit_rate = RFCOMM_RPN_BR_9600; data_bits = RFCOMM_RPN_DATA_8; stop_bits = RFCOMM_RPN_STOP_1; @@ -1514,6 +1520,19 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ goto rpn_out; } + rpn = skb_pull_data(skb, sizeof(*rpn)); + if (!rpn) + return -EILSEQ; + + dlci = __get_dlci(rpn->dlci); + + BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x", + dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl, + rpn->xon_char, rpn->xoff_char, rpn->param_mask); + + if (!cr) + return 0; + /* Check for sane values, ignore/accept bit_rate, 8 bits, 1 stop bit, * no parity, no flow control lines, normal XON/XOFF chars */ @@ -1589,9 +1608,14 @@ rpn_out: static int rfcomm_recv_rls(struct rfcomm_session *s, int cr, struct sk_buff *skb) { - struct rfcomm_rls *rls = (void *) skb->data; - u8 dlci = __get_dlci(rls->dlci); + struct rfcomm_rls *rls; + u8 dlci; + + rls = skb_pull_data(skb, sizeof(*rls)); + if (!rls) + return -EILSEQ; + dlci = __get_dlci(rls->dlci); BT_DBG("dlci %d cr %d status 0x%x", dlci, cr, rls->status); if (!cr) @@ -1608,10 +1632,15 @@ static int rfcomm_recv_rls(struct rfcomm_session *s, int cr, struct sk_buff *skb static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb) { - struct rfcomm_msc *msc = (void *) skb->data; + struct rfcomm_msc *msc; struct rfcomm_dlc *d; - u8 dlci = __get_dlci(msc->dlci); + u8 dlci; + msc = skb_pull_data(skb, sizeof(*msc)); + if (!msc) + return -EILSEQ; + + dlci = __get_dlci(msc->dlci); BT_DBG("dlci %d cr %d v24 0x%x", dlci, cr, msc->v24_sig); d = rfcomm_dlc_get(s, dlci); @@ -1644,17 +1673,19 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb static int rfcomm_recv_mcc(struct rfcomm_session *s, struct sk_buff *skb) { - struct rfcomm_mcc *mcc = (void *) skb->data; + struct rfcomm_mcc *mcc; u8 type, cr, len; + mcc = skb_pull_data(skb, sizeof(*mcc)); + if (!mcc) + return -EILSEQ; + cr = __test_cr(mcc->type); type = __get_mcc_type(mcc->type); len = __get_mcc_len(mcc->len); BT_DBG("%p type 0x%x cr %d", s, type, cr); - skb_pull(skb, 2); - switch (type) { case RFCOMM_PN: rfcomm_recv_pn(s, cr, skb); @@ -1715,9 +1746,12 @@ static int rfcomm_recv_data(struct rfcomm_session *s, u8 dlci, int pf, struct sk } if (pf && d->cfc) { - u8 credits = *(u8 *) skb->data; skb_pull(skb, 1); + u8 *credits = skb_pull_data(skb, 1); + + if (!credits) + goto drop; - d->tx_credits += credits; + d->tx_credits += *credits; if (d->tx_credits) clear_bit(RFCOMM_TX_THROTTLED, &d->flags); } diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index be6639cd6f59..805ed5d28ed6 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -122,7 +122,7 @@ static struct sock *__rfcomm_get_listen_sock_by_addr(u8 channel, bdaddr_t *src) } /* Find socket with channel and source bdaddr. - * Returns closest match. + * Returns closest match with an extra reference held. */ static struct sock *rfcomm_get_sock_by_channel(int state, u8 channel, bdaddr_t *src) { @@ -136,15 +136,25 @@ static struct sock *rfcomm_get_sock_by_channel(int state, u8 channel, bdaddr_t * if (rfcomm_pi(sk)->channel == channel) { /* Exact match. */ - if (!bacmp(&rfcomm_pi(sk)->src, src)) + if (!bacmp(&rfcomm_pi(sk)->src, src)) { + sock_hold(sk); break; + } /* Closest match */ - if (!bacmp(&rfcomm_pi(sk)->src, BDADDR_ANY)) + if (!bacmp(&rfcomm_pi(sk)->src, BDADDR_ANY)) { + if (sk1) + sock_put(sk1); + sk1 = sk; + sock_hold(sk1); + } } } + if (sk && sk1) + sock_put(sk1); + read_unlock(&rfcomm_sk_list.lock); return sk ? sk : sk1; @@ -180,6 +190,8 @@ static void rfcomm_sock_cleanup_listen(struct sock *parent) while ((sk = bt_accept_dequeue(parent, NULL))) { rfcomm_sock_close(sk); rfcomm_sock_kill(sk); + /* Drop the reference handed back by bt_accept_dequeue(). */ + sock_put(sk); } parent->sk_state = BT_CLOSED; @@ -497,8 +509,13 @@ static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, } nsk = bt_accept_dequeue(sk, newsock); - if (nsk) + if (nsk) { + /* Drop the bridging ref from bt_accept_dequeue(); + * the grafted socket keeps nsk alive from here. + */ + sock_put(nsk); break; + } if (!timeo) { err = -EAGAIN; @@ -934,6 +951,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc * { struct sock *sk, *parent; bdaddr_t src, dst; + bool defer_setup = false; int result = 0; BT_DBG("session %p channel %d", s, channel); @@ -947,6 +965,11 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc * lock_sock(parent); + if (parent->sk_state != BT_LISTEN) + goto done; + + defer_setup = test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags); + /* Check for backlog size */ if (sk_acceptq_is_full(parent)) { BT_DBG("backlog full %d", parent->sk_ack_backlog); @@ -974,9 +997,11 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc * done: release_sock(parent); - if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) + if (defer_setup) parent->sk_state_change(parent); + sock_put(parent); + return result; } diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 18826d4b9c0b..140869e5b2df 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -312,11 +312,21 @@ static int sco_connect(struct sock *sk) struct sco_conn *conn; struct hci_conn *hcon; struct hci_dev *hdev; + bdaddr_t src, dst; + struct bt_codec codec; + __u16 setting; int err, type; - BT_DBG("%pMR -> %pMR", &sco_pi(sk)->src, &sco_pi(sk)->dst); + lock_sock(sk); + bacpy(&src, &sco_pi(sk)->src); + bacpy(&dst, &sco_pi(sk)->dst); + setting = sco_pi(sk)->setting; + codec = sco_pi(sk)->codec; + release_sock(sk); - hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); + BT_DBG("%pMR -> %pMR", &src, &dst); + + hdev = hci_get_route(&dst, &src, BDADDR_BREDR); if (!hdev) return -EHOSTUNREACH; @@ -327,7 +337,7 @@ static int sco_connect(struct sock *sk) else type = SCO_LINK; - switch (sco_pi(sk)->setting & SCO_AIRMODE_MASK) { + switch (setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_TRANSP: if (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev)) { err = -EOPNOTSUPP; @@ -336,8 +346,8 @@ static int sco_connect(struct sock *sk) break; } - hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst, - sco_pi(sk)->setting, &sco_pi(sk)->codec, + hcon = hci_connect_sco(hdev, type, &dst, + setting, &codec, READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); @@ -472,9 +482,13 @@ static struct sock *sco_get_sock_listen(bdaddr_t *src) sk1 = sk; } + sk = sk ? sk : sk1; + if (sk) + sock_hold(sk); + read_unlock(&sco_sk_list.lock); - return sk ? sk : sk1; + return sk; } static void sco_sock_destruct(struct sock *sk) @@ -498,6 +512,8 @@ static void sco_sock_cleanup_listen(struct sock *parent) while ((sk = bt_accept_dequeue(parent, NULL))) { sco_sock_close(sk); sco_sock_kill(sk); + /* Drop the reference handed back by bt_accept_dequeue(). */ + sock_put(sk); } parent->sk_state = BT_CLOSED; @@ -515,11 +531,13 @@ static void sco_sock_kill(struct sock *sk) BT_DBG("sk %p state %d", sk, sk->sk_state); /* Sock is dead, so set conn->sk to NULL to avoid possible UAF */ + lock_sock(sk); if (sco_pi(sk)->conn) { sco_conn_lock(sco_pi(sk)->conn); sco_pi(sk)->conn->sk = NULL; sco_conn_unlock(sco_pi(sk)->conn); } + release_sock(sk); /* Kill poor orphan */ bt_sock_unlink(&sco_sk_list, sk); @@ -759,8 +777,13 @@ static int sco_sock_accept(struct socket *sock, struct socket *newsock, } ch = bt_accept_dequeue(sk, newsock); - if (ch) + if (ch) { + /* Drop the bridging ref from bt_accept_dequeue(); + * the grafted socket keeps ch alive from here. + */ + sock_put(ch); break; + } if (!timeo) { err = -EAGAIN; @@ -1365,40 +1388,51 @@ static int sco_sock_release(struct socket *sock) static void sco_conn_ready(struct sco_conn *conn) { - struct sock *parent; - struct sock *sk = conn->sk; + struct sock *parent, *sk; + + sco_conn_lock(conn); + sk = sco_sock_hold(conn); + sco_conn_unlock(conn); BT_DBG("conn %p", conn); if (sk) { lock_sock(sk); - sco_sock_clear_timer(sk); - sk->sk_state = BT_CONNECTED; - sk->sk_state_change(sk); + + /* conn->sk may have become NULL if racing with sk close, but + * due to held hdev->lock, it can't become different sk. + */ + if (conn->sk) { + sco_sock_clear_timer(sk); + sk->sk_state = BT_CONNECTED; + sk->sk_state_change(sk); + } + release_sock(sk); + sock_put(sk); } else { - sco_conn_lock(conn); - - if (!conn->hcon) { - sco_conn_unlock(conn); + if (!conn->hcon) return; - } + + lockdep_assert_held(&conn->hcon->hdev->lock); parent = sco_get_sock_listen(&conn->hcon->src); - if (!parent) { - sco_conn_unlock(conn); + if (!parent) return; - } lock_sock(parent); + sco_conn_lock(conn); + + /* hdev->lock guarantees conn->sk == NULL still here */ + + if (parent->sk_state != BT_LISTEN) + goto release; + sk = sco_sock_alloc(sock_net(parent), NULL, BTPROTO_SCO, GFP_ATOMIC, 0); - if (!sk) { - release_sock(parent); - sco_conn_unlock(conn); - return; - } + if (!sk) + goto release; sco_sock_init(sk, parent); @@ -1417,9 +1451,10 @@ static void sco_conn_ready(struct sco_conn *conn) /* Wake up parent */ parent->sk_data_ready(parent); - release_sock(parent); - +release: sco_conn_unlock(conn); + release_sock(parent); + sock_put(parent); } } diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 881d866d687a..2eef4f3345cd 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4640,10 +4640,24 @@ static void br_multicast_start_querier(struct net_bridge_mcast *brmctx, rcu_read_unlock(); } -static void br_multicast_del_grps(struct net_bridge *br) +static void br_multicast_enable_all_ports(struct net_bridge *br) { struct net_bridge_port *port; + if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) + return; + + list_for_each_entry(port, &br->port_list, list) + __br_multicast_enable_port_ctx(&port->multicast_ctx); +} + +static void br_multicast_disable_all_ports(struct net_bridge *br) +{ + struct net_bridge_port *port; + + if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) + return; + list_for_each_entry(port, &br->port_list, list) __br_multicast_disable_port_ctx(&port->multicast_ctx); } @@ -4651,7 +4665,6 @@ static void br_multicast_del_grps(struct net_bridge *br) int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - struct net_bridge_port *port; bool change_snoopers = false; int err = 0; @@ -4668,7 +4681,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val, br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) { change_snoopers = true; - br_multicast_del_grps(br); + br_multicast_disable_all_ports(br); goto unlock; } @@ -4676,8 +4689,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val, goto unlock; br_multicast_open(br); - list_for_each_entry(port, &br->port_list, list) - __br_multicast_enable_port_ctx(&port->multicast_ctx); + br_multicast_enable_all_ports(br); change_snoopers = true; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 0ab1c94db4b9..0a394e5f4391 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -297,7 +297,11 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_ goto free_skb; } - neigh_hh_bridge(&neigh->hh, skb); + if (neigh_hh_bridge(&neigh->hh, skb)) { + neigh_release(neigh); + goto free_skb; + } + skb->dev = br_indev; ret = br_handle_frame_finish(net, sk, skb); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 6fd5386a1d64..b9591dd755f9 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1000,19 +1000,25 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], br_port_flags_change(p, changed_mask); if (tb[IFLA_BRPORT_COST]) { + spin_lock_bh(&p->br->lock); err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); + spin_unlock_bh(&p->br->lock); if (err) return err; } if (tb[IFLA_BRPORT_PRIORITY]) { + spin_lock_bh(&p->br->lock); err = br_stp_set_port_priority(p, nla_get_u16(tb[IFLA_BRPORT_PRIORITY])); + spin_unlock_bh(&p->br->lock); if (err) return err; } if (tb[IFLA_BRPORT_STATE]) { + spin_lock_bh(&p->br->lock); err = br_set_port_state(p, nla_get_u8(tb[IFLA_BRPORT_STATE])); + spin_unlock_bh(&p->br->lock); if (err) return err; } @@ -1114,9 +1120,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, if (err) return err; - spin_lock_bh(&p->br->lock); err = br_setport(p, tb, extack); - spin_unlock_bh(&p->br->lock); } else { /* Binary compatibility with old RSTP */ if (nla_len(protinfo) < sizeof(u8)) @@ -1203,17 +1207,10 @@ static int br_port_slave_changelink(struct net_device *brdev, struct nlattr *data[], struct netlink_ext_ack *extack) { - struct net_bridge *br = netdev_priv(brdev); - int ret; - if (!data) return 0; - spin_lock_bh(&br->lock); - ret = br_setport(br_port_get_rtnl(dev), data, extack); - spin_unlock_bh(&br->lock); - - return ret; + return br_setport(br_port_get_rtnl(dev), data, extack); } static int br_port_fill_slave_info(struct sk_buff *skb, @@ -1824,6 +1821,7 @@ static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, int *prividx, int attr) { + unsigned int limit = U16_MAX - nla_total_size(0); struct nlattr *nla __maybe_unused; struct net_bridge_port *p = NULL; struct net_bridge_vlan_group *vg; @@ -1841,6 +1839,7 @@ static int br_fill_linkxstats(struct sk_buff *skb, p = br_port_get_rtnl(dev); if (!p) return 0; + limit -= nla_total_size_64bit(sizeof(p->stp_xstats)); br = p->br; vg = nbp_vlan_group(p); break; @@ -1855,6 +1854,9 @@ static int br_fill_linkxstats(struct sk_buff *skb, if (vg) { u16 pvid; +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + limit -= nla_total_size_64bit(sizeof(struct br_mcast_stats)); +#endif pvid = br_get_pvid(vg); list_for_each_entry(v, &vg->vlan_list, vlist) { struct bridge_vlan_xstats vxi; @@ -1862,6 +1864,11 @@ static int br_fill_linkxstats(struct sk_buff *skb, if (++vl_idx < *prividx) continue; + + if (skb_tail_pointer(skb) - (unsigned char *)nest + + nla_total_size(sizeof(vxi)) >= limit) + goto nla_put_failure; + memset(&vxi, 0, sizeof(vxi)); vxi.vid = v->vid; vxi.flags = v->flags; diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 18b558a931ad..ee3ad9dfbab9 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -99,7 +99,6 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, attr.u.brport_flags.val = flags; attr.u.brport_flags.mask = mask; - /* We run from atomic context here */ err = call_switchdev_notifiers(SWITCHDEV_PORT_ATTR_SET, p->dev, &info.info, extack); err = notifier_to_errno(err); diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 1f57c36a7fc0..d6df81fa0d13 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -86,16 +86,34 @@ static ssize_t show_path_cost(struct net_bridge_port *p, char *buf) return sysfs_emit(buf, "%d\n", p->path_cost); } -static BRPORT_ATTR(path_cost, 0644, - show_path_cost, br_stp_set_path_cost); +static int store_path_cost(struct net_bridge_port *p, unsigned long v) +{ + int ret; + + spin_lock_bh(&p->br->lock); + ret = br_stp_set_path_cost(p, v); + spin_unlock_bh(&p->br->lock); + return ret; +} + +static BRPORT_ATTR(path_cost, 0644, show_path_cost, store_path_cost); static ssize_t show_priority(struct net_bridge_port *p, char *buf) { return sysfs_emit(buf, "%d\n", p->priority); } -static BRPORT_ATTR(priority, 0644, - show_priority, br_stp_set_port_priority); +static int store_priority(struct net_bridge_port *p, unsigned long v) +{ + int ret; + + spin_lock_bh(&p->br->lock); + ret = br_stp_set_port_priority(p, v); + spin_unlock_bh(&p->br->lock); + return ret; +} + +static BRPORT_ATTR(priority, 0644, show_priority, store_priority); static ssize_t show_designated_root(struct net_bridge_port *p, char *buf) { @@ -334,17 +352,13 @@ static ssize_t brport_store(struct kobject *kobj, ret = -ENOMEM; goto out_unlock; } - spin_lock_bh(&p->br->lock); ret = brport_attr->store_raw(p, buf_copy); - spin_unlock_bh(&p->br->lock); kfree(buf_copy); } else if (brport_attr->store) { val = simple_strtoul(buf, &endp, 0); if (endp == buf) goto out_unlock; - spin_lock_bh(&p->br->lock); ret = brport_attr->store(p, val); - spin_unlock_bh(&p->br->lock); } if (!ret) { diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c index 7dfbcdfc30e5..c9e229af0366 100644 --- a/net/bridge/netfilter/ebt_snat.c +++ b/net/bridge/netfilter/ebt_snat.c @@ -31,6 +31,9 @@ ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par) const struct arphdr *ap; struct arphdr _ah; + if (skb_ensure_writable(skb, sizeof(_ah) + ETH_ALEN)) + return EBT_DROP; + ap = skb_header_pointer(skb, 0, sizeof(_ah), &_ah); if (ap == NULL) return EBT_DROP; diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 741360219552..f05c79f215ea 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -112,24 +112,22 @@ static struct pernet_operations broute_net_ops = { static int __init ebtable_broute_init(void) { - int ret = ebt_register_template(&broute_table, broute_table_init); + int ret = register_pernet_subsys(&broute_net_ops); if (ret) return ret; - ret = register_pernet_subsys(&broute_net_ops); - if (ret) { - ebt_unregister_template(&broute_table); - return ret; - } + ret = ebt_register_template(&broute_table, broute_table_init); + if (ret) + unregister_pernet_subsys(&broute_net_ops); - return 0; + return ret; } static void __exit ebtable_broute_fini(void) { - unregister_pernet_subsys(&broute_net_ops); ebt_unregister_template(&broute_table); + unregister_pernet_subsys(&broute_net_ops); } module_init(ebtable_broute_init); diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index dacd81b12e62..0fc03b07e62a 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -93,24 +93,22 @@ static struct pernet_operations frame_filter_net_ops = { static int __init ebtable_filter_init(void) { - int ret = ebt_register_template(&frame_filter, frame_filter_table_init); + int ret = register_pernet_subsys(&frame_filter_net_ops); if (ret) return ret; - ret = register_pernet_subsys(&frame_filter_net_ops); - if (ret) { - ebt_unregister_template(&frame_filter); - return ret; - } + ret = ebt_register_template(&frame_filter, frame_filter_table_init); + if (ret) + unregister_pernet_subsys(&frame_filter_net_ops); - return 0; + return ret; } static void __exit ebtable_filter_fini(void) { - unregister_pernet_subsys(&frame_filter_net_ops); ebt_unregister_template(&frame_filter); + unregister_pernet_subsys(&frame_filter_net_ops); } module_init(ebtable_filter_init); diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 0f2a8c6118d4..8a10375d8909 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -93,24 +93,22 @@ static struct pernet_operations frame_nat_net_ops = { static int __init ebtable_nat_init(void) { - int ret = ebt_register_template(&frame_nat, frame_nat_table_init); + int ret = register_pernet_subsys(&frame_nat_net_ops); if (ret) return ret; - ret = register_pernet_subsys(&frame_nat_net_ops); - if (ret) { - ebt_unregister_template(&frame_nat); - return ret; - } + ret = ebt_register_template(&frame_nat, frame_nat_table_init); + if (ret) + unregister_pernet_subsys(&frame_nat_net_ops); return ret; } static void __exit ebtable_nat_fini(void) { - unregister_pernet_subsys(&frame_nat_net_ops); ebt_unregister_template(&frame_nat); + unregister_pernet_subsys(&frame_nat_net_ops); } module_init(ebtable_nat_init); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index aea3e19875c6..8a6a069329d2 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -42,6 +42,7 @@ struct ebt_pernet { struct list_head tables; + struct list_head dead_tables; }; struct ebt_template { @@ -1162,11 +1163,6 @@ free_newinfo: static void __ebt_unregister_table(struct net *net, struct ebt_table *table) { - mutex_lock(&ebt_mutex); - list_del(&table->list); - mutex_unlock(&ebt_mutex); - audit_log_nfcfg(table->name, AF_BRIDGE, table->private->nentries, - AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size, ebt_cleanup_entry, net, NULL); if (table->private->nentries) @@ -1267,13 +1263,15 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, for (i = 0; i < num_ops; i++) ops[i].priv = table; - list_add(&table->list, &ebt_net->tables); - mutex_unlock(&ebt_mutex); - table->ops = ops; ret = nf_register_net_hooks(net, ops, num_ops); - if (ret) + if (ret) { + synchronize_rcu(); __ebt_unregister_table(net, table); + } else { + list_add(&table->list, &ebt_net->tables); + } + mutex_unlock(&ebt_mutex); audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries, AUDIT_XT_OP_REGISTER, GFP_KERNEL); @@ -1339,7 +1337,7 @@ void ebt_unregister_template(const struct ebt_table *t) } EXPORT_SYMBOL(ebt_unregister_template); -static struct ebt_table *__ebt_find_table(struct net *net, const char *name) +void ebt_unregister_table_pre_exit(struct net *net, const char *name) { struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); struct ebt_table *t; @@ -1348,30 +1346,36 @@ static struct ebt_table *__ebt_find_table(struct net *net, const char *name) list_for_each_entry(t, &ebt_net->tables, list) { if (strcmp(t->name, name) == 0) { + list_move(&t->list, &ebt_net->dead_tables); mutex_unlock(&ebt_mutex); - return t; + nf_unregister_net_hooks(net, t->ops, hweight32(t->valid_hooks)); + return; } } mutex_unlock(&ebt_mutex); - return NULL; -} - -void ebt_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct ebt_table *table = __ebt_find_table(net, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } EXPORT_SYMBOL(ebt_unregister_table_pre_exit); void ebt_unregister_table(struct net *net, const char *name) { - struct ebt_table *table = __ebt_find_table(net, name); + struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); + struct ebt_table *t; - if (table) - __ebt_unregister_table(net, table); + mutex_lock(&ebt_mutex); + + list_for_each_entry(t, &ebt_net->dead_tables, list) { + if (strcmp(t->name, name) == 0) { + list_del(&t->list); + audit_log_nfcfg(t->name, AF_BRIDGE, t->private->nentries, + AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); + __ebt_unregister_table(net, t); + mutex_unlock(&ebt_mutex); + return; + } + } + + mutex_unlock(&ebt_mutex); } /* userspace just supplied us with counters */ @@ -1952,6 +1956,25 @@ enum compat_mwt { EBT_COMPAT_TARGET, }; +static bool match_size_ok(const struct xt_match *match, unsigned int match_size) +{ + u16 csize; + + if (match->matchsize == -1) /* cannot validate ebt_among */ + return true; + + csize = match->compatsize ? : match->matchsize; + + return match_size >= csize; +} + +static bool tgt_size_ok(const struct xt_target *tgt, unsigned int tgt_size) +{ + u16 csize = tgt->compatsize ? : tgt->targetsize; + + return tgt_size >= csize; +} + static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt, enum compat_mwt compat_mwt, struct ebt_entries_buf_state *state, @@ -1977,6 +2000,11 @@ static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt, if (IS_ERR(match)) return PTR_ERR(match); + if (!match_size_ok(match, match_size)) { + module_put(match->me); + return -EINVAL; + } + off = ebt_compat_match_offset(match, match_size); if (dst) { if (match->compat_from_user) @@ -1996,6 +2024,12 @@ static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt, mwt->u.revision); if (IS_ERR(wt)) return PTR_ERR(wt); + + if (!tgt_size_ok(wt, match_size)) { + module_put(wt->me); + return -EINVAL; + } + off = xt_compat_target_offset(wt); if (dst) { @@ -2556,11 +2590,21 @@ static int __net_init ebt_pernet_init(struct net *net) struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); INIT_LIST_HEAD(&ebt_net->tables); + INIT_LIST_HEAD(&ebt_net->dead_tables); return 0; } +static void __net_exit ebt_pernet_exit(struct net *net) +{ + struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); + + WARN_ON_ONCE(!list_empty(&ebt_net->tables)); + WARN_ON_ONCE(!list_empty(&ebt_net->dead_tables)); +} + static struct pernet_operations ebt_net_ops = { .init = ebt_pernet_init, + .exit = ebt_pernet_exit, .id = &ebt_pernet_id, .size = sizeof(struct ebt_pernet), }; @@ -2569,19 +2613,20 @@ static int __init ebtables_init(void) { int ret; - ret = xt_register_target(&ebt_standard_target); + ret = register_pernet_subsys(&ebt_net_ops); if (ret < 0) return ret; - ret = nf_register_sockopt(&ebt_sockopts); + + ret = xt_register_target(&ebt_standard_target); if (ret < 0) { - xt_unregister_target(&ebt_standard_target); + unregister_pernet_subsys(&ebt_net_ops); return ret; } - ret = register_pernet_subsys(&ebt_net_ops); + ret = nf_register_sockopt(&ebt_sockopts); if (ret < 0) { - nf_unregister_sockopt(&ebt_sockopts); xt_unregister_target(&ebt_standard_target); + unregister_pernet_subsys(&ebt_net_ops); return ret; } diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 692e0b868822..9e64e82d0b63 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -115,6 +115,11 @@ static int __ceph_x_decrypt(const struct ceph_crypto_key *key, int usage_slot, if (ret) return ret; + if (plaintext_len < sizeof(*hdr)) { + pr_err("%s plaintext too small %d\n", __func__, plaintext_len); + return -EINVAL; + } + hdr = p + ceph_crypt_data_offset(key); if (le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC) { pr_err("%s bad magic\n", __func__); diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 254ded0b05f6..521aec1d5fc0 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -47,7 +47,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) { kfree(b->h.items); - kfree(b); } void crush_destroy_bucket_list(struct crush_bucket_list *b) @@ -55,14 +54,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b) kfree(b->item_weights); kfree(b->sum_weights); kfree(b->h.items); - kfree(b); } void crush_destroy_bucket_tree(struct crush_bucket_tree *b) { kfree(b->h.items); kfree(b->node_weights); - kfree(b); } void crush_destroy_bucket_straw(struct crush_bucket_straw *b) @@ -70,14 +67,12 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) kfree(b->straws); kfree(b->item_weights); kfree(b->h.items); - kfree(b); } void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) { kfree(b->item_weights); kfree(b->h.items); - kfree(b); } void crush_destroy_bucket(struct crush_bucket *b) @@ -99,6 +94,7 @@ void crush_destroy_bucket(struct crush_bucket *b) crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b); break; } + kfree(b); } /** diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c89e66d4fcb7..8b5b0587a0cf 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -72,8 +72,7 @@ static int crush_decode_uniform_bucket(void **p, void *end, struct crush_bucket_uniform *b) { dout("crush_decode_uniform_bucket %p to %p\n", *p, end); - ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad); - b->item_weight = ceph_decode_32(p); + ceph_decode_32_safe(p, end, b->item_weight, bad); return 0; bad: return -EINVAL; @@ -389,11 +388,15 @@ static int decode_choose_args(void **p, void *end, struct crush_map *c) goto fail; if (arg->ids_size && - arg->ids_size != c->buckets[bucket_index]->size) + (!c->buckets[bucket_index] || + arg->ids_size != c->buckets[bucket_index]->size)) goto e_inval; } - insert_choose_arg_map(&c->choose_args, arg_map); + if (!__insert_choose_arg_map(&c->choose_args, arg_map)) { + ret = -EEXIST; + goto fail; + } } return 0; @@ -516,6 +519,10 @@ static struct crush_map *crush_decode(void *pbyval, void *end) b->id = ceph_decode_32(p); b->type = ceph_decode_16(p); b->alg = ceph_decode_8(p); + if (b->alg != alg) { + b->alg = 0; + goto bad; + } b->hash = ceph_decode_8(p); b->weight = ceph_decode_32(p); b->size = ceph_decode_32(p); @@ -1702,7 +1709,7 @@ static int osdmap_decode(void **p, void *end, bool msgr2, ceph_decode_need(p, end, 3*sizeof(u32) + map->max_osd*(struct_v >= 5 ? sizeof(u32) : sizeof(u8)) + - sizeof(*map->osd_weight), e_inval); + map->max_osd*sizeof(*map->osd_weight), e_inval); if (ceph_decode_32(p) != map->max_osd) goto e_inval; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 14eb7812bda4..ecd659f79fd4 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -172,7 +172,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) struct bpf_map *map; smap = rcu_dereference(SDATA(selem)->smap); - if (!(smap->map.map_flags & BPF_F_CLONE)) + if (!smap || !(smap->map.map_flags & BPF_F_CLONE)) continue; /* Note that for lockless listeners adding new element @@ -531,10 +531,10 @@ err_free: } EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc); -static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) +static int diag_get(struct bpf_local_storage_map *smap, + struct bpf_local_storage_data *sdata, struct sk_buff *skb) { struct nlattr *nla_stg, *nla_value; - struct bpf_local_storage_map *smap; /* It cannot exceed max nlattr's payload */ BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE); @@ -543,7 +543,6 @@ static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) if (!nla_stg) return -EMSGSIZE; - smap = rcu_dereference(sdata->smap); if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id)) goto errout; @@ -558,6 +557,7 @@ static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) sdata->data, true); else copy_map_value(&smap->map, nla_data(nla_value), sdata->data); + check_and_init_map_value(&smap->map, nla_data(nla_value)); nla_nest_end(skb, nla_stg); return 0; @@ -596,9 +596,11 @@ static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb, saved_len = skb->len; hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { smap = rcu_dereference(SDATA(selem)->smap); + if (!smap) + continue; diag_size += nla_value_size(smap->map.value_size); - if (nla_stgs && diag_get(SDATA(selem), skb)) + if (nla_stgs && diag_get(smap, SDATA(selem), skb)) /* Continue to learn diag_size */ err = -EMSGSIZE; } @@ -665,7 +667,7 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, diag_size += nla_value_size(diag->maps[i]->value_size); - if (nla_stgs && diag_get(sdata, skb)) + if (nla_stgs && diag_get((struct bpf_local_storage_map *)diag->maps[i], sdata, skb)) /* Continue to learn diag_size */ err = -EMSGSIZE; } diff --git a/net/core/dev.c b/net/core/dev.c index 06c195906231..0c6c270d9f7d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -371,7 +371,7 @@ static void netdev_name_node_alt_free(struct rcu_head *head) static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) { netdev_name_node_del(name_node); - list_del(&name_node->list); + list_del_rcu(&name_node->list); call_rcu(&name_node->rcu, netdev_name_node_alt_free); } @@ -6862,9 +6862,9 @@ static void skb_defer_free_flush(void) #if defined(CONFIG_NET_RX_BUSY_POLL) -static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) +static void __busy_poll_stop(struct napi_struct *napi, unsigned long timeout) { - if (!skip_schedule) { + if (!timeout) { gro_normal_list(&napi->gro); __napi_schedule(napi); return; @@ -6874,6 +6874,8 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) gro_flush_normal(&napi->gro, HZ >= 1000); clear_bit(NAPI_STATE_SCHED, &napi->state); + hrtimer_start(&napi->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); } enum { @@ -6885,8 +6887,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, unsigned flags, u16 budget) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; - bool skip_schedule = false; - unsigned long timeout; + unsigned long timeout = 0; int rc; /* Busy polling means there is a high chance device driver hard irq @@ -6906,10 +6907,12 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); - timeout = napi_get_gro_flush_timeout(napi); - if (napi->defer_hard_irqs_count && timeout) { - hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); - skip_schedule = true; + if (napi->defer_hard_irqs_count) { + /* A short enough gro flush timeout and long enough + * poll can result in timer firing too early. + * Timer will be armed later if necessary. + */ + timeout = napi_get_gro_flush_timeout(napi); } } @@ -6924,7 +6927,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); if (rc == budget) - __busy_poll_stop(napi, skip_schedule); + __busy_poll_stop(napi, timeout); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } diff --git a/net/core/devmem.c b/net/core/devmem.c index 468344739db2..4f71de44c0fb 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -241,6 +241,11 @@ net_devmem_bind_dmabuf(struct net_device *dev, } if (direction == DMA_TO_DEVICE) { + if (!IS_ALIGNED(dmabuf->size, PAGE_SIZE)) { + err = -EINVAL; + NL_SET_ERR_MSG(extack, "TX dma-buf size must be a multiple of PAGE_SIZE"); + goto err_unmap; + } binding->tx_vec = kvmalloc_objs(struct net_iov *, dmabuf->size / PAGE_SIZE); if (!binding->tx_vec) { @@ -267,6 +272,12 @@ net_devmem_bind_dmabuf(struct net_device *dev, size_t len = sg_dma_len(sg); struct net_iov *niov; + if (!IS_ALIGNED(len, PAGE_SIZE)) { + err = -EINVAL; + NL_SET_ERR_MSG(extack, "dma-buf SG length must be PAGE_SIZE aligned"); + goto err_free_chunks; + } + owner = kzalloc_node(sizeof(*owner), GFP_KERNEL, dev_to_node(&dev->dev)); if (!owner) { diff --git a/net/core/failover.c b/net/core/failover.c index 11bb183c7a1b..e43c59cd6868 100644 --- a/net/core/failover.c +++ b/net/core/failover.c @@ -12,6 +12,7 @@ #include <uapi/linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> +#include <net/netdev_lock.h> #include <net/failover.h> static LIST_HEAD(failover_list); @@ -221,8 +222,11 @@ failover_existing_slave_register(struct net_device *failover_dev) for_each_netdev(net, dev) { if (netif_is_failover(dev)) continue; - if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) + if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) { + netdev_lock_ops(dev); failover_slave_register(dev); + netdev_unlock_ops(dev); + } } rtnl_unlock(); } diff --git a/net/core/filter.c b/net/core/filter.c index 80a3b702a2d4..80439767e0ee 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1654,15 +1654,24 @@ err_prog_put: return err; } +static void sk_reuseport_prog_free_rcu(struct rcu_head *rcu) +{ + struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); + struct bpf_prog *prog = aux->prog; + + bpf_release_orig_filter(prog); + bpf_prog_free(prog); +} + void sk_reuseport_prog_free(struct bpf_prog *prog) { if (!prog) return; - if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) - bpf_prog_put(prog); + if (bpf_prog_was_classic(prog)) + call_rcu(&prog->aux->rcu, sk_reuseport_prog_free_rcu); else - bpf_prog_destroy(prog); + bpf_prog_put(prog); } static inline int __bpf_try_make_writable(struct sk_buff *skb, @@ -2860,7 +2869,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, psge->length = start - offset; rsge.length -= psge->length; - rsge.offset += start; + rsge.offset += start - offset; sk_msg_iter_var_next(i); sg_unmark_end(psge); @@ -5481,7 +5490,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { - if (sk->sk_protocol != IPPROTO_TCP) + if (!sk_is_tcp(sk)) return -EINVAL; switch (optname) { @@ -5688,6 +5697,30 @@ const struct bpf_func_proto bpf_sk_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_sk_setsockopt_nodelay, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + /* + * TCP_NODELAY triggers tcp_push_pending_frames() and re-enters + * CA_EVENT_TX_START in bpf_tcp_cc. + */ + if (level == SOL_TCP && optname == TCP_NODELAY) + return -EOPNOTSUPP; + + return _bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto = { + .func = bpf_sk_setsockopt_nodelay, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { @@ -5833,6 +5866,12 @@ BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; + /* TCP_NODELAY triggers tcp_push_pending_frames() and re-enters these callbacks. */ + if ((bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB || + bpf_sock->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB) && + level == SOL_TCP && optname == TCP_NODELAY) + return -EOPNOTSUPP; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } @@ -6443,6 +6482,8 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, * against MTU of FIB lookup resulting net_device */ dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; if (!is_skb_forwardable(dev, skb)) rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; @@ -7443,7 +7484,7 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) { - if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + if (sk_fullsock(sk) && sk_is_tcp(sk)) return (unsigned long)sk; return (unsigned long)NULL; @@ -11915,7 +11956,7 @@ BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) */ BTF_TYPE_EMIT(struct tcp6_sock); if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && - sk->sk_family == AF_INET6) + sk->sk_type == SOCK_STREAM && sk->sk_family == AF_INET6) return (unsigned long)sk; return (unsigned long)NULL; @@ -11931,7 +11972,7 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) { - if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + if (sk && sk_fullsock(sk) && sk_is_tcp(sk)) return (unsigned long)sk; return (unsigned long)NULL; diff --git a/net/core/gro.c b/net/core/gro.c index 31d21de5b15a..a84753983467 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -109,6 +109,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) if (p->pp_recycle != skb->pp_recycle) return -ETOOMANYREFS; + if (skb_zcopy(p) || skb_zcopy(skb)) + return -ETOOMANYREFS; + if (unlikely(p->len + len >= netif_get_gro_max_size(p->dev, p) || NAPI_GRO_CB(skb)->flush)) return -E2BIG; @@ -213,10 +216,12 @@ done: p->data_len += len; p->truesize += delta_truesize; p->len += len; + skb_shinfo(p)->flags |= skbinfo->flags & SKBFL_SHARED_FRAG; if (lp != p) { lp->data_len += len; lp->truesize += delta_truesize; lp->len += len; + skb_shinfo(lp)->flags |= skbinfo->flags & SKBFL_SHARED_FRAG; } NAPI_GRO_CB(skb)->same_flow = 1; return 0; @@ -244,6 +249,8 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) p->truesize += skb->truesize; p->len += skb->len; + skb_shinfo(p)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; + NAPI_GRO_CB(skb)->same_flow = 1; return 0; diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index 3e6fde8f1726..23175cb2bd86 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -8,18 +8,21 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } -static inline bool netmem_is_pp(netmem_ref netmem) +static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) +{ + netmem_to_nmdesc(netmem)->pp_magic |= pp_magic; +} + +static inline void netmem_clear_pp_magic(netmem_ref netmem) { - struct page *page; + WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK); - /* XXX: Now that the offset of page_type is shared between - * struct page and net_iov, just cast the netmem to struct page - * unconditionally by clearing NET_IOV if any, no matter whether - * it comes from struct net_iov or struct page. This should be - * adjusted once the offset is no longer shared. - */ - page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); - return PageNetpp(page); + netmem_to_nmdesc(netmem)->pp_magic = 0; +} + +static inline bool netmem_is_pp(netmem_ref netmem) +{ + return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; } static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 4381e0fc25bf..3f4a17fa5713 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -319,6 +319,8 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) lockdep_assert_irqs_disabled(); dev = np->dev; + /* npinfo->txq belongs to np->dev, so retries must stay bound to it. */ + skb->dev = dev; rcu_read_lock(); npinfo = rcu_dereference_bh(dev->npinfo); @@ -608,14 +610,16 @@ EXPORT_SYMBOL_GPL(__netpoll_setup); /* * Returns a pointer to a string representation of the identifier used * to select the egress interface for the given netpoll instance. buf - * must be a buffer of length at least MAC_ADDR_STR_LEN + 1. + * is used to format np->dev_mac when np->dev_name is empty; bufsz must + * be at least MAC_ADDR_STR_LEN + 1 to fit the formatted MAC address + * and its NUL terminator. */ -static char *egress_dev(struct netpoll *np, char *buf) +static char *egress_dev(struct netpoll *np, char *buf, size_t bufsz) { if (np->dev_name[0]) return np->dev_name; - snprintf(buf, MAC_ADDR_STR_LEN, "%pM", np->dev_mac); + snprintf(buf, bufsz, "%pM", np->dev_mac); return buf; } @@ -645,7 +649,7 @@ static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev) if (!IS_ENABLED(CONFIG_IPV6)) { np_err(np, "IPv6 is not supported %s, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); return -EINVAL; } @@ -667,7 +671,7 @@ static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev) } if (err) { np_err(np, "no IPv6 address for %s, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); return err; } @@ -687,14 +691,14 @@ static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev) in_dev = __in_dev_get_rtnl(ndev); if (!in_dev) { np_err(np, "no IP address for %s, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); return -EDESTADDRREQ; } ifa = rtnl_dereference(in_dev->ifa_list); if (!ifa) { np_err(np, "no IP address for %s, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); return -EDESTADDRREQ; } @@ -736,7 +740,8 @@ int netpoll_setup(struct netpoll *np) ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac); if (!ndev) { - np_err(np, "%s doesn't exist, aborting\n", egress_dev(np, buf)); + np_err(np, "%s doesn't exist, aborting\n", + egress_dev(np, buf, sizeof(buf))); err = -ENODEV; goto unlock; } @@ -744,14 +749,14 @@ int netpoll_setup(struct netpoll *np) if (netdev_master_upper_dev_get(ndev)) { np_err(np, "%s is a slave device, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); err = -EBUSY; goto put; } if (!netif_running(ndev)) { np_info(np, "device %s not up yet, forcing it\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); err = dev_open(ndev, NULL); if (err) { diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 6e576dec80db..8171d1173221 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -707,18 +707,8 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict) void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) { - struct page *page; - netmem_set_pp(netmem, pool); - - /* XXX: Now that the offset of page_type is shared between - * struct page and net_iov, just cast the netmem to struct page - * unconditionally by clearing NET_IOV if any, no matter whether - * it comes from struct net_iov or struct page. This should be - * adjusted once the offset is no longer shared. - */ - page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); - __SetPageNetpp(page); + netmem_or_pp_magic(netmem, PP_SIGNATURE); /* Ensuring all pages have been split into one fragment initially: * page_pool_set_pp_info() is only called once for every page when it @@ -733,17 +723,7 @@ void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) void page_pool_clear_pp_info(netmem_ref netmem) { - struct page *page; - - /* XXX: Now that the offset of page_type is shared between - * struct page and net_iov, just cast the netmem to struct page - * unconditionally by clearing NET_IOV if any, no matter whether - * it comes from struct net_iov or struct page. This should be - * adjusted once the offset is no longer shared. - */ - page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); - __ClearPageNetpp(page); - + netmem_clear_pp_magic(netmem); netmem_set_pp(netmem, NULL); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b613bb6e07df..511c25bf6f2a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1572,6 +1572,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, port_guid.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + memset(&vf_broadcast, 0, sizeof(vf_broadcast)); memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; @@ -6327,8 +6328,9 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, &filters, &idxattr, &prividx, extack); if (err < 0) { - /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ - WARN_ON(err == -EMSGSIZE); + /* -EMSGSIZE implies BUG in if_nlmsg_stats_size + * or a too big nested attribute. + */ kfree_skb(nskb); } else { err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7dad68e3b518..c02f0a507ba8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2248,6 +2248,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, skb_frag_ref(skb, i); } skb_shinfo(n)->nr_frags = i; + skb_shinfo(n)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; } if (skb_has_frag_list(skb)) { @@ -2786,6 +2787,8 @@ done: skb->data_len = 0; skb_set_tail_pointer(skb, len); } + if (!skb_shinfo(skb)->nr_frags && !skb_has_frag_list(skb)) + skb->unreadable = 0; if (!skb->sk || skb->destructor == sock_edemux) skb_condense(skb); @@ -2793,16 +2796,37 @@ done: } EXPORT_SYMBOL(___pskb_trim); +static int pskb_trim_rcsum_complete(struct sk_buff *skb, unsigned int len) +{ + int delta = skb->len - len; + + if (skb_frags_readable(skb)) { + skb->csum = csum_block_sub(skb->csum, + skb_checksum(skb, len, delta, 0), + len); + return 0; + } + + if (len > skb_headlen(skb)) + return -EFAULT; + + /* The trimmed bytes are unreadable, but the remaining packet can be + * checksummed by software after trimming. + */ + skb->ip_summed = CHECKSUM_NONE; + return 0; +} + /* Note : use pskb_trim_rcsum() instead of calling this directly */ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) { - int delta = skb->len - len; + int err; - skb->csum = csum_block_sub(skb->csum, - skb_checksum(skb, len, delta, 0), - len); + err = pskb_trim_rcsum_complete(skb, len); + if (err) + return err; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; int offset = skb_checksum_start_offset(skb) + skb->csum_offset; @@ -4349,6 +4373,8 @@ onlymerged: tgt->ip_summed = CHECKSUM_PARTIAL; skb->ip_summed = CHECKSUM_PARTIAL; + skb_shinfo(tgt)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; + skb_len_add(skb, -shiftlen); skb_len_add(tgt, shiftlen); @@ -4959,7 +4985,8 @@ normal: skb_copy_from_linear_data_offset(head_skb, offset, skb_put(nskb, hsize), hsize); - skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & + skb_shinfo(nskb)->flags |= (skb_shinfo(head_skb)->flags | + skb_shinfo(frag_skb)->flags) & SKBFL_SHARED_FRAG; if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) @@ -4976,6 +5003,9 @@ normal: nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; frag_skb = list_skb; + + skb_shinfo(nskb)->flags |= skb_shinfo(frag_skb)->flags & SKBFL_SHARED_FRAG; + if (!skb_headlen(list_skb)) { BUG_ON(!nfrags); } else { @@ -6200,6 +6230,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, from_shinfo->frags, from_shinfo->nr_frags * sizeof(skb_frag_t)); to_shinfo->nr_frags += from_shinfo->nr_frags; + if (from_shinfo->nr_frags) + to_shinfo->flags |= from_shinfo->flags & SKBFL_SHARED_FRAG; if (!skb_cloned(from)) from_shinfo->nr_frags = 0; @@ -6791,6 +6823,11 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb_copy_from_linear_data_offset(skb, off, data, new_hlen); skb->len -= off; + /* Remove SKBFL_MANAGED_FRAG_REFS instead of trying to honour it + * while refcounting frags below. + */ + skb_zcopy_downgrade_managed(skb); + memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, @@ -6801,6 +6838,8 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb_kfree_head(data); return -ENOMEM; } + if (skb_zcopy(skb)) + net_zcopy_get(skb_zcopy(skb)); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) @@ -6902,6 +6941,11 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, return -ENOMEM; size = SKB_WITH_OVERHEAD(size); + /* Remove SKBFL_MANAGED_FRAG_REFS instead of trying to honour it + * while refcounting frags below. + */ + skb_zcopy_downgrade_managed(skb); + memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); if (skb_orphan_frags(skb, gfp_mask)) { @@ -6944,6 +6988,8 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb_kfree_head(data); return -ENOMEM; } + if (skb_zcopy(skb)) + net_zcopy_get(skb_zcopy(skb)); skb_release_data(skb, SKB_CONSUMED); skb->head = data; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 6187a83bd741..e1850caf1a71 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1268,12 +1268,19 @@ out: static void sk_psock_verdict_data_ready(struct sock *sk) { const struct proto_ops *ops = NULL; + struct sk_psock *psock; struct socket *sock; int copied; trace_sk_data_ready(sk); rcu_read_lock(); + psock = sk_psock(sk); + if (psock && tls_sw_has_ctx_rx(sk)) { + psock->saved_data_ready(sk); + rcu_read_unlock(); + return; + } sock = READ_ONCE(sk->sk_socket); if (likely(sock)) ops = READ_ONCE(sock->ops); @@ -1283,8 +1290,6 @@ static void sk_psock_verdict_data_ready(struct sock *sk) copied = ops->read_skb(sk, sk_psock_verdict_recv); if (copied >= 0) { - struct sk_psock *psock; - rcu_read_lock(); psock = sk_psock(sk); if (psock) diff --git a/net/core/sock.c b/net/core/sock.c index b37b664b6eb9..d097025c116a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2676,8 +2676,12 @@ void sock_wfree(struct sk_buff *skb) int old; if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { + void (*sk_write_space)(struct sock *sk); + + sk_write_space = READ_ONCE(sk->sk_write_space); + if (sock_flag(sk, SOCK_RCU_FREE) && - sk->sk_write_space == sock_def_write_space) { + sk_write_space == sock_def_write_space) { rcu_read_lock(); free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc, &old); @@ -2693,7 +2697,7 @@ void sock_wfree(struct sk_buff *skb) * after sk_write_space() call */ WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); - sk->sk_write_space(sk); + sk_write_space(sk); len = 1; } /* diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 02a68be3002a..99e3789492a0 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1630,18 +1630,23 @@ void sock_map_unhash(struct sock *sk) void (*saved_unhash)(struct sock *sk); struct sk_psock *psock; +retry: rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_unhash = READ_ONCE(sk->sk_prot)->unhash; + if (unlikely(saved_unhash == sock_map_unhash)) + goto retry; } else { saved_unhash = psock->saved_unhash; sock_map_remove_links(sk, psock); rcu_read_unlock(); + + if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) + return; } - if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) - return; + if (saved_unhash) saved_unhash(sk); } @@ -1652,20 +1657,25 @@ void sock_map_destroy(struct sock *sk) void (*saved_destroy)(struct sock *sk); struct sk_psock *psock; +retry: rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_destroy = READ_ONCE(sk->sk_prot)->destroy; + if (unlikely(saved_destroy == sock_map_destroy)) + goto retry; } else { saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); sk_psock_put(sk, psock); + + if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) + return; } - if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) - return; + if (saved_destroy) saved_destroy(sk); } @@ -1676,32 +1686,33 @@ void sock_map_close(struct sock *sk, long timeout) void (*saved_close)(struct sock *sk, long timeout); struct sk_psock *psock; +retry: lock_sock(sk); rcu_read_lock(); - psock = sk_psock(sk); + psock = sk_psock_get(sk); if (likely(psock)) { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); - psock = sk_psock_get(sk); - if (unlikely(!psock)) - goto no_psock; rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); + + /* Make sure we do not recurse. This is a bug. + * Leak the socket instead of crashing on a stack overflow. + */ + if (WARN_ON_ONCE(saved_close == sock_map_close)) + return; } else { saved_close = READ_ONCE(sk->sk_prot)->close; -no_psock: rcu_read_unlock(); release_sock(sk); + + if (unlikely(saved_close == sock_map_close)) + goto retry; } - /* Make sure we do not recurse. This is a bug. - * Leak the socket instead of crashing on a stack overflow. - */ - if (WARN_ON_ONCE(saved_close == sock_map_close)) - return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); diff --git a/net/devlink/core.c b/net/devlink/core.c index eeb6a71f5f56..fe9f6a0a67d5 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -518,6 +518,8 @@ void devlink_free(struct devlink *devlink) { ASSERT_DEVLINK_NOT_REGISTERED(devlink); + devlink_rel_put(devlink); + WARN_ON(!list_empty(&devlink->trap_policer_list)); WARN_ON(!list_empty(&devlink->trap_group_list)); WARN_ON(!list_empty(&devlink->trap_list)); diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c index 8bb98d3ea3db..a3a2cc6480a0 100644 --- a/net/ethtool/bitset.c +++ b/net/ethtool/bitset.c @@ -92,7 +92,7 @@ static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, u32 mask; if (end <= start) - return true; + return false; if (start % 32) { mask = ethnl_upper_bits(start); @@ -105,11 +105,11 @@ static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, start_word++; } - if (!memchr_inv(map + start_word, '\0', - (end_word - start_word) * sizeof(u32))) + if (memchr_inv(map + start_word, '\0', + (end_word - start_word) * sizeof(u32))) return true; if (end % 32 == 0) - return true; + return false; return map[end_word] & ethnl_lower_bits(end); } diff --git a/net/ethtool/cmis.h b/net/ethtool/cmis.h index 4a9a946cabf0..778783a0f23c 100644 --- a/net/ethtool/cmis.h +++ b/net/ethtool/cmis.h @@ -63,9 +63,9 @@ struct ethtool_cmis_cdb_request { * struct ethtool_cmis_cdb_cmd_args - CDB commands execution arguments * @req: CDB command fields as described in the CMIS standard. * @max_duration: Maximum duration time for command completion in msec. + * @msleep_pre_rpl: Waiting time before checking reply in msec. * @read_write_len_ext: Allowable additional number of byte octets to the LPL * in a READ or a WRITE commands. - * @msleep_pre_rpl: Waiting time before checking reply in msec. * @rpl_exp_len: Expected reply length in bytes. * @flags: Validation flags for CDB commands. * @err_msg: Error message to be sent to user space. @@ -73,8 +73,8 @@ struct ethtool_cmis_cdb_request { struct ethtool_cmis_cdb_cmd_args { struct ethtool_cmis_cdb_request req; u16 max_duration; + u16 msleep_pre_rpl; u8 read_write_len_ext; - u8 msleep_pre_rpl; u8 rpl_exp_len; u8 flags; char *err_msg; diff --git a/net/ethtool/cmis_cdb.c b/net/ethtool/cmis_cdb.c index 3670ca42dd40..f3a53a984460 100644 --- a/net/ethtool/cmis_cdb.c +++ b/net/ethtool/cmis_cdb.c @@ -513,8 +513,13 @@ static int cmis_cdb_process_reply(struct net_device *dev, } rpl = (struct ethtool_cmis_cdb_rpl *)page_data->data; - if ((args->rpl_exp_len > rpl->hdr.rpl_len + rpl_hdr_len) || - !rpl->hdr.rpl_chk_code) { + if (rpl->hdr.rpl_len != args->rpl_exp_len) { + netdev_warn(dev, "CDB reply length mismatch, expected %u got %u\n", + args->rpl_exp_len, rpl->hdr.rpl_len); + err = -EIO; + goto out; + } + if (!rpl->hdr.rpl_chk_code) { err = -EIO; goto out; } diff --git a/net/ethtool/cmis_fw_update.c b/net/ethtool/cmis_fw_update.c index df5f344209c4..291d04d2776a 100644 --- a/net/ethtool/cmis_fw_update.c +++ b/net/ethtool/cmis_fw_update.c @@ -44,6 +44,20 @@ enum cmis_cdb_fw_write_mechanism { CMIS_CDB_FW_WRITE_MECHANISM_BOTH = 0x11, }; +/* See section 9.7.2 "CMD 0101h: Start Firmware Download" in CMIS standard + * revision 5.2. + * struct cmis_cdb_start_fw_download_pl is a structured layout of the + * flat array, ethtool_cmis_cdb_request::payload. + */ +struct cmis_cdb_start_fw_download_pl { + __struct_group(cmis_cdb_start_fw_download_pl_h, head, /* no attrs */, + __be32 image_size; + __be32 resv1; + ); + u8 vendor_data[ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH - + sizeof(struct cmis_cdb_start_fw_download_pl_h)]; +}; + static int cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb, struct net_device *dev, @@ -86,6 +100,14 @@ cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb, */ cdb->read_write_len_ext = rpl->read_write_len_ext; fw_mng->start_cmd_payload_size = rpl->start_cmd_payload_size; + if (fw_mng->start_cmd_payload_size > + sizeof_field(struct cmis_cdb_start_fw_download_pl, vendor_data)) { + ethnl_module_fw_flash_ntf_err(dev, ntf_params, + "Start cmd payload size exceeds max LPL payload", + NULL); + return -EINVAL; + } + fw_mng->write_mechanism = rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL ? CMIS_CDB_FW_WRITE_MECHANISM_LPL : @@ -97,20 +119,6 @@ cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb, return 0; } -/* See section 9.7.2 "CMD 0101h: Start Firmware Download" in CMIS standard - * revision 5.2. - * struct cmis_cdb_start_fw_download_pl is a structured layout of the - * flat array, ethtool_cmis_cdb_request::payload. - */ -struct cmis_cdb_start_fw_download_pl { - __struct_group(cmis_cdb_start_fw_download_pl_h, head, /* no attrs */, - __be32 image_size; - __be32 resv1; - ); - u8 vendor_data[ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH - - sizeof(struct cmis_cdb_start_fw_download_pl_h)]; -}; - static int cmis_fw_update_start_download(struct ethtool_cmis_cdb *cdb, struct ethtool_cmis_fw_update_params *fw_update, @@ -122,6 +130,14 @@ cmis_fw_update_start_download(struct ethtool_cmis_cdb *cdb, u8 lpl_len; int err; + if (fw_update->fw->size < vendor_data_size) { + ethnl_module_fw_flash_ntf_err(fw_update->dev, + &fw_update->ntf_params, + "Firmware image too small for module's start payload", + NULL); + return -EINVAL; + } + pl.image_size = cpu_to_be32(fw_update->fw->size); memcpy(pl.vendor_data, fw_update->fw->data, vendor_data_size); diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c index 1e2c5c7048a8..e73fc3e5a02b 100644 --- a/net/ethtool/coalesce.c +++ b/net/ethtool/coalesce.c @@ -472,6 +472,12 @@ static int ethnl_update_profile(struct net_device *dev, nla_for_each_nested_type(nest, ETHTOOL_A_PROFILE_IRQ_MODERATION, nests, rem) { + if (i >= NET_DIM_PARAMS_NUM_PROFILES) { + NL_SET_BAD_ATTR(extack, nest); + ret = -E2BIG; + goto err_out; + } + ret = nla_parse_nested(tb, len_irq_moder - 1, nest, coalesce_irq_moderation_policy, extack); diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c index a557e3996c85..0b8cfeddb014 100644 --- a/net/ethtool/eeprom.c +++ b/net/ethtool/eeprom.c @@ -44,6 +44,9 @@ static int fallback_set_params(struct eeprom_req_info *request, if (offset >= modinfo->eeprom_len) return -EINVAL; + if (length > modinfo->eeprom_len - offset) + return -EINVAL; + eeprom->cmd = ETHTOOL_GMODULEEEPROM; eeprom->len = length; eeprom->offset = offset; @@ -69,7 +72,7 @@ static int eeprom_fallback(struct eeprom_req_info *request, if (err < 0) return err; - data = kmalloc(eeprom.len, GFP_KERNEL); + data = kzalloc(eeprom.len, GFP_KERNEL); if (!data) return -ENOMEM; err = ethtool_get_module_eeprom_call(dev, &eeprom, data); @@ -141,12 +144,11 @@ static int eeprom_prepare_data(const struct ethnl_req_info *req_base, return 0; err_ops: + if (ret == -EOPNOTSUPP) + ret = eeprom_fallback(request, reply); ethnl_ops_complete(dev); err_free: kfree(page_data.data); - - if (ret == -EOPNOTSUPP) - return eeprom_fallback(request, reply); return ret; } diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index 8a5985fd7712..24569e92942c 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -106,10 +106,8 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_LINKSTATE_HEADER, info->extack); - if (IS_ERR(phydev)) { - ret = PTR_ERR(phydev); - goto out; - } + if (IS_ERR(phydev)) + return PTR_ERR(phydev); ret = ethnl_ops_begin(dev); if (ret < 0) diff --git a/net/ethtool/module.c b/net/ethtool/module.c index cad2eb25b5a4..ea4fb2a76650 100644 --- a/net/ethtool/module.c +++ b/net/ethtool/module.c @@ -120,12 +120,6 @@ ethnl_set_module_validate(struct ethnl_req_info *req_info, if (!tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]) return 0; - if (req_info->dev->ethtool->module_fw_flash_in_progress) { - NL_SET_ERR_MSG(info->extack, - "Module firmware flashing is in progress"); - return -EBUSY; - } - if (!ops->get_module_power_mode || !ops->set_module_power_mode) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY], @@ -148,6 +142,12 @@ ethnl_set_module(struct ethnl_req_info *req_info, struct genl_info *info) ops = dev->ethtool_ops; + if (dev->ethtool->module_fw_flash_in_progress) { + NL_SET_ERR_MSG(info->extack, + "Module firmware flashing is in progress"); + return -EBUSY; + } + power_new.policy = nla_get_u8(tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]); ret = ops->get_module_power_mode(dev, &power, info->extack); if (ret < 0) @@ -221,14 +221,22 @@ static void module_flash_fw_work_list_del(struct list_head *list) static void module_flash_fw_work(struct work_struct *work) { struct ethtool_module_fw_flash *module_fw; + struct net_device *dev; module_fw = container_of(work, struct ethtool_module_fw_flash, work); + dev = module_fw->fw_update.dev; ethtool_cmis_fw_update(&module_fw->fw_update); module_flash_fw_work_list_del(&module_fw->list); - module_fw->fw_update.dev->ethtool->module_fw_flash_in_progress = false; - netdev_put(module_fw->fw_update.dev, &module_fw->dev_tracker); + + rtnl_lock(); + netdev_lock_ops(dev); + dev->ethtool->module_fw_flash_in_progress = false; + netdev_unlock_ops(dev); + rtnl_unlock(); + + netdev_put(dev, &module_fw->dev_tracker); release_firmware(module_fw->fw_update.fw); kfree(module_fw); } @@ -283,11 +291,9 @@ void ethnl_module_fw_flash_sock_destroy(struct ethnl_sock_priv *sk_priv) spin_lock(&module_fw_flash_work_list_lock); list_for_each_entry(work, &module_fw_flash_work_list, list) { - if (work->fw_update.dev == sk_priv->dev && - work->fw_update.ntf_params.portid == sk_priv->portid) { + if (work->fw_update.ntf_params.portid == sk_priv->portid && + dev_net(work->fw_update.dev) == sk_priv->net) work->fw_update.ntf_params.closed_sock = true; - break; - } } spin_unlock(&module_fw_flash_work_list_lock); } @@ -319,14 +325,13 @@ module_flash_fw_schedule(struct net_device *dev, const char *file_name, if (err < 0) goto err_release_firmware; - dev->ethtool->module_fw_flash_in_progress = true; - netdev_hold(dev, &module_fw->dev_tracker, GFP_KERNEL); fw_update->dev = dev; fw_update->ntf_params.portid = info->snd_portid; fw_update->ntf_params.seq = info->snd_seq; fw_update->ntf_params.closed_sock = false; - err = ethnl_sock_priv_set(skb, dev, fw_update->ntf_params.portid, + err = ethnl_sock_priv_set(skb, dev_net(dev), + fw_update->ntf_params.portid, ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH); if (err < 0) goto err_release_firmware; @@ -335,6 +340,9 @@ module_flash_fw_schedule(struct net_device *dev, const char *file_name, if (err < 0) goto err_release_firmware; + dev->ethtool->module_fw_flash_in_progress = true; + netdev_hold(dev, &module_fw->dev_tracker, GFP_KERNEL); + schedule_work(&module_fw->work); return 0; @@ -427,10 +435,11 @@ int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info) ret = ethnl_module_fw_flash_validate(dev, info->extack); if (ret < 0) - goto out_unlock; + goto out_complete; ret = module_flash_fw(dev, tb, skb, info); +out_complete: ethnl_ops_complete(dev); out_unlock: diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 5046023a30b1..7d45f9a884e5 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -53,7 +53,7 @@ const struct nla_policy ethnl_header_policy_phy_stats[] = { [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; -int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid, +int ethnl_sock_priv_set(struct sk_buff *skb, struct net *net, u32 portid, enum ethnl_sock_type type) { struct ethnl_sock_priv *sk_priv; @@ -62,7 +62,7 @@ int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid, if (IS_ERR(sk_priv)) return PTR_ERR(sk_priv); - sk_priv->dev = dev; + sk_priv->net = net; sk_priv->portid = portid; sk_priv->type = type; diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index aaf6f2468768..fd2198e45d2b 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -318,12 +318,12 @@ enum ethnl_sock_type { }; struct ethnl_sock_priv { - struct net_device *dev; + struct net *net; u32 portid; enum ethnl_sock_type type; }; -int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid, +int ethnl_sock_priv_set(struct sk_buff *skb, struct net *net, u32 portid, enum ethnl_sock_type type); /** diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c index d4e6887055ab..ddc6eab701ed 100644 --- a/net/ethtool/phy.c +++ b/net/ethtool/phy.c @@ -76,6 +76,7 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info, struct nlattr **tb = info->attrs; struct phy_device_node *pdn; struct phy_device *phydev; + int ret; /* RTNL is held by the caller */ phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PHY_HEADER, @@ -88,8 +89,19 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info, return -EOPNOTSUPP; rep_data->phyindex = phydev->phyindex; + rep_data->name = kstrdup(dev_name(&phydev->mdio.dev), GFP_KERNEL); - rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL); + if (!rep_data->name) + return -ENOMEM; + + if (phydev->drv) { + rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL); + if (!rep_data->drvname) { + ret = -ENOMEM; + goto err_free_name; + } + } + rep_data->upstream_type = pdn->upstream_type; if (pdn->upstream_type == PHY_UPSTREAM_PHY) { @@ -97,15 +109,33 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info, rep_data->upstream_index = upstream->phyindex; } - if (pdn->parent_sfp_bus) + if (pdn->parent_sfp_bus) { rep_data->upstream_sfp_name = kstrdup(sfp_get_name(pdn->parent_sfp_bus), GFP_KERNEL); + if (!rep_data->upstream_sfp_name) { + ret = -ENOMEM; + goto err_free_drvname; + } + } - if (phydev->sfp_bus) + if (phydev->sfp_bus) { rep_data->downstream_sfp_name = kstrdup(sfp_get_name(phydev->sfp_bus), GFP_KERNEL); + if (!rep_data->downstream_sfp_name) { + ret = -ENOMEM; + goto err_free_upstream_sfp; + } + } return 0; + +err_free_upstream_sfp: + kfree(rep_data->upstream_sfp_name); +err_free_drvname: + kfree(rep_data->drvname); +err_free_name: + kfree(rep_data->name); + return ret; } static int phy_fill_reply(struct sk_buff *skb, diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c index 2eb9bdc2dcb9..757c9e0cc856 100644 --- a/net/ethtool/pse-pd.c +++ b/net/ethtool/pse-pd.c @@ -62,14 +62,14 @@ static int pse_prepare_data(const struct ethnl_req_info *req_base, struct phy_device *phydev; int ret; - ret = ethnl_ops_begin(dev); - if (ret < 0) - return ret; - phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PSE_HEADER, info->extack); if (IS_ERR(phydev)) - return -ENODEV; + return PTR_ERR(phydev); + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; ret = pse_get_pse_attributes(phydev, info->extack, data); diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 353110b862ab..53792f53f922 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -134,8 +134,7 @@ rss_get_data_alloc(struct net_device *dev, struct rss_reply_data *data) if (!rss_config) return -ENOMEM; - if (data->indir_size) - data->indir_table = (u32 *)rss_config; + data->indir_table = (u32 *)rss_config; if (data->hkey_size) data->hkey = rss_config + indir_bytes; @@ -170,8 +169,10 @@ rss_prepare_get(const struct rss_req_info *request, struct net_device *dev, rxfh.key = data->hkey; ret = ops->get_rxfh(dev, &rxfh); - if (ret) + if (ret) { + rss_get_data_free(data); goto out_unlock; + } data->hfunc = rxfh.hfunc; data->input_xfrm = rxfh.input_xfrm; @@ -686,7 +687,7 @@ rss_set_prep_indir(struct net_device *dev, struct genl_info *info, ethtool_rxfh_indir_default(i, num_rx_rings); } - *mod |= memcmp(rxfh->indir, data->indir_table, data->indir_size); + *mod |= memcmp(rxfh->indir, data->indir_table, alloc_size); return user_size; @@ -981,11 +982,17 @@ ethnl_rss_create_validate(struct net_device *dev, struct genl_info *info) } static void -ethnl_rss_create_send_ntf(struct sk_buff *rsp, struct net_device *dev) +ethnl_rss_create_send_ntf(const struct sk_buff *rsp, struct net_device *dev) { - struct nlmsghdr *nlh = (void *)rsp->data; struct genlmsghdr *genl_hdr; + struct nlmsghdr *nlh; + struct sk_buff *ntf; + + ntf = skb_copy_expand(rsp, 0, 0, GFP_KERNEL); + if (!ntf) + return; + nlh = nlmsg_hdr(ntf); /* Convert the reply into a notification */ nlh->nlmsg_pid = 0; nlh->nlmsg_seq = ethnl_bcast_seq_next(); @@ -993,7 +1000,7 @@ ethnl_rss_create_send_ntf(struct sk_buff *rsp, struct net_device *dev) genl_hdr = nlmsg_data(nlh); genl_hdr->cmd = ETHTOOL_MSG_RSS_CREATE_NTF; - ethnl_multicast(rsp, dev); + ethnl_multicast(ntf, dev); } int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info) @@ -1099,17 +1106,13 @@ int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info) ntf_fail |= rss_fill_reply(rsp, &req.base, &data.base); if (WARN_ON(!hdr || ntf_fail)) { ret = -EMSGSIZE; - goto exit_unlock; + goto err_remove_ctx; } genlmsg_end(rsp, hdr); - /* Use the same skb for the response and the notification, - * genlmsg_reply() will copy the skb if it has elevated user count. - */ - skb_get(rsp); - ret = genlmsg_reply(rsp, info); ethnl_rss_create_send_ntf(rsp, dev); + ret = genlmsg_reply(rsp, info); rsp = NULL; exit_unlock: @@ -1131,6 +1134,10 @@ exit_free_rsp: nlmsg_free(rsp); return ret; +err_remove_ctx: + if (ops->remove_rxfh_context(dev, ctx, req.rss_context, NULL)) + /* leave the context on failure, like ethnl_rss_delete_doit() */ + goto exit_unlock; err_ctx_id_free: xa_erase(&dev->ethtool->rss_ctx, req.rss_context); err_unlock_free_ctx: @@ -1168,8 +1175,10 @@ int ethnl_rss_delete_doit(struct sk_buff *skb, struct genl_info *info) dev = req.dev; ops = dev->ethtool_ops; - if (!ops->create_rxfh_context) + if (!ops->create_rxfh_context) { + ret = -EOPNOTSUPP; goto exit_free_dev; + } rtnl_lock(); netdev_lock_ops(dev); diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index bb1e829ba099..94c4718d31ae 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -311,7 +311,7 @@ static int strset_prepare_data(const struct ethnl_req_info *req_base, return 0; } - phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_HEADER_FLAGS, + phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_STRSET_HEADER, info->extack); /* phydev can be NULL, check for errors only */ diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c index e4f518e49d4c..fc4f93cfa459 100644 --- a/net/ethtool/tsconfig.c +++ b/net/ethtool/tsconfig.c @@ -69,8 +69,10 @@ static int tsconfig_prepare_data(const struct ethnl_req_info *req_base, if (ret) goto out; - if (ts_info.phc_index == -1) - return -ENODEV; + if (ts_info.phc_index == -1) { + ret = -ENODEV; + goto out; + } data->hwprov_desc.index = ts_info.phc_index; data->hwprov_desc.qualifier = ts_info.phc_qualifier; @@ -224,16 +226,21 @@ static int tsconfig_send_reply(struct net_device *dev, struct genl_info *info) reply_len = ret + ethnl_reply_header_size(); rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_TSCONFIG_SET_REPLY, ETHTOOL_A_TSCONFIG_HEADER, info, &reply_payload); - if (!rskb) + if (!rskb) { + ret = -ENOMEM; goto err_cleanup; + } ret = tsconfig_fill_reply(rskb, &req_info->base, &reply_data->base); if (ret < 0) - goto err_cleanup; + goto err_free_msg; genlmsg_end(rskb, reply_payload); ret = genlmsg_reply(rskb, info); + rskb = NULL; +err_free_msg: + nlmsg_free(rskb); err_cleanup: kfree(reply_data); kfree(req_info); diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c index a865f0fdd26b..14bf01e3b55c 100644 --- a/net/ethtool/tsinfo.c +++ b/net/ethtool/tsinfo.c @@ -83,6 +83,11 @@ tsinfo_parse_request(struct ethnl_req_info *req_base, if (!tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER]) return 0; + if (req_base->flags & ETHTOOL_FLAG_STATS) { + NL_SET_ERR_MSG(extack, "can't query statistics for a provider"); + return -EOPNOTSUPP; + } + return ts_parse_hwtst_provider(tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER], &req->hwprov_desc, extack, &mod); } @@ -402,10 +407,8 @@ static int ethnl_tsinfo_dump_one_netdev(struct sk_buff *skb, continue; ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb); - if (IS_ERR(ehdr)) { - ret = PTR_ERR(ehdr); - goto err; - } + if (IS_ERR(ehdr)) + return PTR_ERR(ehdr); reply_data->ts_info.phc_qualifier = ctx->pos_phcqualifier; ret = ops->get_ts_info(dev, &reply_data->ts_info); @@ -523,6 +526,12 @@ int ethnl_tsinfo_start(struct netlink_callback *cb) if (ret < 0) goto free_reply_data; + if (req_info->base.flags & ETHTOOL_FLAG_STATS) { + NL_SET_ERR_MSG(cb->extack, "stats not supported in dump"); + ret = -EOPNOTSUPP; + goto err_dev_put; + } + ctx->req_info = req_info; ctx->reply_data = reply_data; ctx->pos_ifindex = 0; @@ -532,6 +541,8 @@ int ethnl_tsinfo_start(struct netlink_callback *cb) return 0; +err_dev_put: + ethnl_parse_header_dev_put(&req_info->base); free_reply_data: kfree(reply_data); free_req_info: diff --git a/net/handshake/genl.c b/net/handshake/genl.c index 870612609491..4b20cd9cdd0e 100644 --- a/net/handshake/genl.c +++ b/net/handshake/genl.c @@ -10,6 +10,7 @@ #include "genl.h" #include <uapi/linux/handshake.h> +#include <linux/err.h> /* HANDSHAKE_CMD_ACCEPT - do */ static const struct nla_policy handshake_accept_nl_policy[HANDSHAKE_A_ACCEPT_HANDLER_CLASS + 1] = { @@ -18,7 +19,7 @@ static const struct nla_policy handshake_accept_nl_policy[HANDSHAKE_A_ACCEPT_HAN /* HANDSHAKE_CMD_DONE - do */ static const struct nla_policy handshake_done_nl_policy[HANDSHAKE_A_DONE_REMOTE_AUTH + 1] = { - [HANDSHAKE_A_DONE_STATUS] = { .type = NLA_U32, }, + [HANDSHAKE_A_DONE_STATUS] = NLA_POLICY_MAX(NLA_U32, MAX_ERRNO), [HANDSHAKE_A_DONE_SOCKFD] = { .type = NLA_S32, }, [HANDSHAKE_A_DONE_REMOTE_AUTH] = { .type = NLA_U32, }, }; diff --git a/net/handshake/genl.h b/net/handshake/genl.h index 8d3e18672daf..46b65f131669 100644 --- a/net/handshake/genl.h +++ b/net/handshake/genl.h @@ -11,6 +11,7 @@ #include <net/genetlink.h> #include <uapi/linux/handshake.h> +#include <linux/err.h> int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info); int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info); diff --git a/net/handshake/handshake-test.c b/net/handshake/handshake-test.c index 55442b2f518a..3dd507470d5f 100644 --- a/net/handshake/handshake-test.c +++ b/net/handshake/handshake-test.c @@ -25,7 +25,7 @@ static int test_accept_func(struct handshake_req *req, struct genl_info *info, return 0; } -static void test_done_func(struct handshake_req *req, unsigned int status, +static void test_done_func(struct handshake_req *req, int status, struct genl_info *info) { } @@ -208,6 +208,7 @@ static void handshake_req_submit_test3(struct kunit *test) static void handshake_req_submit_test4(struct kunit *test) { struct handshake_req *req, *result; + unsigned long fcount_before; struct socket *sock; struct file *filp; int err; @@ -224,8 +225,10 @@ static void handshake_req_submit_test4(struct kunit *test) KUNIT_ASSERT_NOT_NULL(test, sock->sk); sock->file = filp; + fcount_before = file_count(filp); err = handshake_req_submit(sock, req, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1); /* Act */ result = handshake_req_hash_lookup(sock->sk); @@ -235,11 +238,13 @@ static void handshake_req_submit_test4(struct kunit *test) KUNIT_EXPECT_PTR_EQ(test, req, result); handshake_req_cancel(sock->sk); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before); fput(filp); } static void handshake_req_submit_test5(struct kunit *test) { + unsigned long fcount_before; struct handshake_req *req; struct handshake_net *hn; struct socket *sock; @@ -265,12 +270,14 @@ static void handshake_req_submit_test5(struct kunit *test) saved = hn->hn_pending; hn->hn_pending = hn->hn_pending_max + 1; + fcount_before = file_count(filp); /* Act */ err = handshake_req_submit(sock, req, GFP_KERNEL); /* Assert */ KUNIT_EXPECT_EQ(test, err, -EAGAIN); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before); fput(filp); hn->hn_pending = saved; @@ -279,6 +286,7 @@ static void handshake_req_submit_test5(struct kunit *test) static void handshake_req_submit_test6(struct kunit *test) { struct handshake_req *req1, *req2; + unsigned long fcount_before; struct socket *sock; struct file *filp; int err; @@ -296,21 +304,26 @@ static void handshake_req_submit_test6(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); KUNIT_ASSERT_NOT_NULL(test, sock->sk); sock->file = filp; + fcount_before = file_count(filp); /* Act */ err = handshake_req_submit(sock, req1, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1); err = handshake_req_submit(sock, req2, GFP_KERNEL); /* Assert */ KUNIT_EXPECT_EQ(test, err, -EBUSY); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1); handshake_req_cancel(sock->sk); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before); fput(filp); } static void handshake_req_cancel_test1(struct kunit *test) { + unsigned long fcount_before; struct handshake_req *req; struct socket *sock; struct file *filp; @@ -329,8 +342,10 @@ static void handshake_req_cancel_test1(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); sock->file = filp; + fcount_before = file_count(filp); err = handshake_req_submit(sock, req, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1); /* NB: handshake_req hasn't been accepted */ @@ -339,12 +354,14 @@ static void handshake_req_cancel_test1(struct kunit *test) /* Assert */ KUNIT_EXPECT_TRUE(test, result); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before); fput(filp); } static void handshake_req_cancel_test2(struct kunit *test) { + unsigned long fcount_before; struct handshake_req *req, *next; struct handshake_net *hn; struct socket *sock; @@ -365,8 +382,10 @@ static void handshake_req_cancel_test2(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); sock->file = filp; + fcount_before = file_count(filp); err = handshake_req_submit(sock, req, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1); net = sock_net(sock->sk); hn = handshake_pernet(net); @@ -375,18 +394,24 @@ static void handshake_req_cancel_test2(struct kunit *test) /* Pretend to accept this request */ next = handshake_req_next(hn, HANDSHAKE_HANDLER_CLASS_TLSHD); KUNIT_ASSERT_PTR_EQ(test, req, next); + /* Simulate FD_PREPARE() consuming the file reference handed + * off by handshake_req_next(); see handshake_nl_accept_doit(). + */ + fput(filp); /* Act */ result = handshake_req_cancel(sock->sk); /* Assert */ KUNIT_EXPECT_TRUE(test, result); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before); fput(filp); } static void handshake_req_cancel_test3(struct kunit *test) { + unsigned long fcount_before; struct handshake_req *req, *next; struct handshake_net *hn; struct socket *sock; @@ -407,8 +432,10 @@ static void handshake_req_cancel_test3(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); sock->file = filp; + fcount_before = file_count(filp); err = handshake_req_submit(sock, req, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1); net = sock_net(sock->sk); hn = handshake_pernet(net); @@ -417,15 +444,21 @@ static void handshake_req_cancel_test3(struct kunit *test) /* Pretend to accept this request */ next = handshake_req_next(hn, HANDSHAKE_HANDLER_CLASS_TLSHD); KUNIT_ASSERT_PTR_EQ(test, req, next); + /* Simulate FD_PREPARE() consuming the file reference handed + * off by handshake_req_next(); see handshake_nl_accept_doit(). + */ + fput(filp); /* Pretend to complete this request */ handshake_complete(next, -ETIMEDOUT, NULL); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before); /* Act */ result = handshake_req_cancel(sock->sk); /* Assert */ KUNIT_EXPECT_FALSE(test, result); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before); fput(filp); } @@ -446,6 +479,7 @@ static struct handshake_proto handshake_req_alloc_proto_destroy = { static void handshake_req_destroy_test1(struct kunit *test) { + unsigned long fcount_before; struct handshake_req *req; struct socket *sock; struct file *filp; @@ -465,10 +499,12 @@ static void handshake_req_destroy_test1(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); sock->file = filp; + fcount_before = file_count(filp); err = handshake_req_submit(sock, req, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); handshake_req_cancel(sock->sk); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before); /* Act */ /* Ensure the close/release/put process has run to diff --git a/net/handshake/handshake.h b/net/handshake/handshake.h index a48163765a7a..da61cadd1ad3 100644 --- a/net/handshake/handshake.h +++ b/net/handshake/handshake.h @@ -24,6 +24,7 @@ enum hn_flags_bits { HANDSHAKE_F_NET_DRAINING, }; +struct file; struct handshake_proto; /* One handshake request */ @@ -32,6 +33,7 @@ struct handshake_req { struct rhash_head hr_rhash; unsigned long hr_flags; const struct handshake_proto *hr_proto; + struct file *hr_file; struct sock *hr_sk; void (*hr_odestruct)(struct sock *sk); @@ -57,7 +59,7 @@ struct handshake_proto { int (*hp_accept)(struct handshake_req *req, struct genl_info *info, int fd); void (*hp_done)(struct handshake_req *req, - unsigned int status, + int status, struct genl_info *info); void (*hp_destroy)(struct handshake_req *req); }; @@ -86,7 +88,7 @@ struct handshake_req *handshake_req_hash_lookup(struct sock *sk); struct handshake_req *handshake_req_next(struct handshake_net *hn, int class); int handshake_req_submit(struct socket *sock, struct handshake_req *req, gfp_t flags); -void handshake_complete(struct handshake_req *req, unsigned int status, +void handshake_complete(struct handshake_req *req, int status, struct genl_info *info); bool handshake_req_cancel(struct sock *sk); diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c index b989456fc4c5..3fd4fef9bab1 100644 --- a/net/handshake/netlink.c +++ b/net/handshake/netlink.c @@ -92,7 +92,6 @@ int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info) struct net *net = sock_net(skb->sk); struct handshake_net *hn = handshake_pernet(net); struct handshake_req *req = NULL; - struct socket *sock; int class, err; err = -EOPNOTSUPP; @@ -107,15 +106,13 @@ int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info) err = -EAGAIN; req = handshake_req_next(hn, class); if (req) { - sock = req->hr_sk->sk_socket; - - FD_PREPARE(fdf, O_CLOEXEC, sock->file); + FD_PREPARE(fdf, O_CLOEXEC, req->hr_file); if (fdf.err) { + fput(req->hr_file); /* drop ref from handshake_req_next() */ err = fdf.err; goto out_complete; } - get_file(sock->file); /* FD_PREPARE() consumes a reference. */ err = req->hr_proto->hp_accept(req, info, fd_prepare_fd(fdf)); if (err) goto out_complete; /* Automatic cleanup handles fput */ @@ -160,7 +157,7 @@ int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info) status = -EIO; if (info->attrs[HANDSHAKE_A_DONE_STATUS]) - status = nla_get_u32(info->attrs[HANDSHAKE_A_DONE_STATUS]); + status = -(int)nla_get_u32(info->attrs[HANDSHAKE_A_DONE_STATUS]); handshake_complete(req, status, info); sockfd_put(sock); @@ -202,21 +199,21 @@ static void __net_exit handshake_net_exit(struct net *net) * accepted and are in progress will be destroyed when * the socket is closed. */ - spin_lock(&hn->hn_lock); + spin_lock_bh(&hn->hn_lock); set_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags); - list_splice_init(&requests, &hn->hn_requests); - spin_unlock(&hn->hn_lock); + list_splice_init(&hn->hn_requests, &requests); + list_for_each_entry(req, &requests, hr_list) + get_file(req->hr_file); + spin_unlock_bh(&hn->hn_lock); while (!list_empty(&requests)) { - req = list_first_entry(&requests, struct handshake_req, hr_list); - list_del(&req->hr_list); - - /* - * Requests on this list have not yet been - * accepted, so they do not have an fd to put. - */ + struct file *file; + req = list_first_entry(&requests, struct handshake_req, hr_list); + file = req->hr_file; + list_del_init(&req->hr_list); handshake_complete(req, -ETIMEDOUT, NULL); + fput(file); } } diff --git a/net/handshake/request.c b/net/handshake/request.c index 2829adbeb149..cd30d54d0501 100644 --- a/net/handshake/request.c +++ b/net/handshake/request.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/inet.h> +#include <linux/file.h> #include <linux/rhashtable.h> #include <net/sock.h> @@ -162,35 +163,56 @@ static void __remove_pending_locked(struct handshake_net *hn, * otherwise %false. * * If @req was on a pending list, it has not yet been accepted. + * Returns %false when the net namespace is draining; the drain + * loop has taken ownership of the pending list. */ static bool remove_pending(struct handshake_net *hn, struct handshake_req *req) { bool ret = false; - spin_lock(&hn->hn_lock); - if (!list_empty(&req->hr_list)) { + spin_lock_bh(&hn->hn_lock); + if (!test_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags) && + !list_empty(&req->hr_list)) { __remove_pending_locked(hn, req); ret = true; } - spin_unlock(&hn->hn_lock); + spin_unlock_bh(&hn->hn_lock); return ret; } +/** + * handshake_req_next - Return the next queued handshake request + * @hn: per-net handshake state + * @class: handler class to match + * + * On a non-NULL return, the caller owns an extra reference + * on @req->hr_file. FD_PREPARE() consumes it on success; on + * the FD_PREPARE() failure path the caller must fput() it. + * + * Return: pointer to a removed handshake_req, or NULL. + */ struct handshake_req *handshake_req_next(struct handshake_net *hn, int class) { struct handshake_req *req, *pos; req = NULL; - spin_lock(&hn->hn_lock); + spin_lock_bh(&hn->hn_lock); list_for_each_entry(pos, &hn->hn_requests, hr_list) { if (pos->hr_proto->hp_handler_class != class) continue; __remove_pending_locked(hn, pos); + /* Hand off a file reference to the accept side under + * hn_lock. A concurrent handshake_req_cancel() can drop + * hr_file before accept reaches FD_PREPARE(); this extra + * reference keeps the file alive until FD_PREPARE() takes + * ownership. + */ + get_file(pos->hr_file); req = pos; break; } - spin_unlock(&hn->hn_lock); + spin_unlock_bh(&hn->hn_lock); return req; } @@ -215,9 +237,16 @@ EXPORT_SYMBOL_IF_KUNIT(handshake_req_next); * A zero return value from handshake_req_submit() means that * exactly one subsequent completion callback is guaranteed. * - * A negative return value from handshake_req_submit() means that - * no completion callback will be done and that @req has been - * destroyed. + * A negative return value from handshake_req_submit() guarantees that + * no completion callback will occur and that @req is no longer owned by + * the caller. If cancellation wins the completion race after the request + * has been published, final destruction is deferred until socket teardown. + * + * The caller must hold a reference on @sock->file for the duration + * of this call. Once the request is published to the accept side, a + * concurrent completion or cancellation may release the request's pin on + * @sock->file; the caller's reference is what keeps @sock->sk valid until + * handshake_req_submit() returns. */ int handshake_req_submit(struct socket *sock, struct handshake_req *req, gfp_t flags) @@ -236,6 +265,14 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req, kfree(req); return -EINVAL; } + + /* + * Pin sock->file for the lifetime of the request so the + * accept side does not race a consumer that releases the + * socket while a handshake is pending. + */ + req->hr_file = get_file(sock->file); + req->hr_odestruct = req->hr_sk->sk_destruct; req->hr_sk->sk_destruct = handshake_sk_destruct; @@ -249,7 +286,7 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req, if (READ_ONCE(hn->hn_pending) >= hn->hn_pending_max) goto out_err; - spin_lock(&hn->hn_lock); + spin_lock_bh(&hn->hn_lock); ret = -EOPNOTSUPP; if (test_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags)) goto out_unlock; @@ -258,7 +295,7 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req, goto out_unlock; if (!__add_pending_locked(hn, req)) goto out_unlock; - spin_unlock(&hn->hn_lock); + spin_unlock_bh(&hn->hn_lock); ret = handshake_genl_notify(net, req->hr_proto, flags); if (ret) { @@ -267,35 +304,36 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req, goto out_err; } - /* Prevent socket release while a handshake request is pending */ - sock_hold(req->hr_sk); - trace_handshake_submit(net, req, req->hr_sk); return 0; out_unlock: - spin_unlock(&hn->hn_lock); + spin_unlock_bh(&hn->hn_lock); out_err: - /* Restore original destructor so socket teardown still runs on failure */ - req->hr_sk->sk_destruct = req->hr_odestruct; trace_handshake_submit_err(net, req, req->hr_sk, ret); - handshake_req_destroy(req); + if (!test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) { + /* Restore original destructor so socket teardown still runs. */ + req->hr_sk->sk_destruct = req->hr_odestruct; + fput(req->hr_file); + handshake_req_destroy(req); + } return ret; } EXPORT_SYMBOL(handshake_req_submit); -void handshake_complete(struct handshake_req *req, unsigned int status, +void handshake_complete(struct handshake_req *req, int status, struct genl_info *info) { struct sock *sk = req->hr_sk; struct net *net = sock_net(sk); if (!test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) { + struct file *file = req->hr_file; + trace_handshake_complete(net, req, sk, status); req->hr_proto->hp_done(req, status, info); - /* Handshake request is no longer pending */ - sock_put(sk); + fput(file); } } EXPORT_SYMBOL_IF_KUNIT(handshake_complete); @@ -342,8 +380,7 @@ bool handshake_req_cancel(struct sock *sk) out_true: trace_handshake_cancel(net, req, sk); - /* Handshake request is no longer pending */ - sock_put(sk); + fput(req->hr_file); return true; } EXPORT_SYMBOL(handshake_req_cancel); diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c index 8f9532a15f43..7567150c2a4f 100644 --- a/net/handshake/tlshd.c +++ b/net/handshake/tlshd.c @@ -93,7 +93,7 @@ static void tls_handshake_remote_peerids(struct tls_handshake_req *treq, * */ static void tls_handshake_done(struct handshake_req *req, - unsigned int status, struct genl_info *info) + int status, struct genl_info *info) { struct tls_handshake_req *treq = handshake_req_private(req); @@ -104,7 +104,7 @@ static void tls_handshake_done(struct handshake_req *req, if (!status) set_bit(HANDSHAKE_F_REQ_SESSION, &req->hr_flags); - treq->th_consumer_done(treq->th_consumer_data, -status, + treq->th_consumer_done(treq->th_consumer_data, status, treq->th_peerid[0]); } @@ -425,6 +425,8 @@ EXPORT_SYMBOL(tls_server_hello_psk); * Request cancellation races with request completion. To determine * who won, callers examine the return value from this function. * + * Context: May be called from process or softirq context. + * * Return values: * %true - Uncompleted handshake request was canceled * %false - Handshake request already completed or not found diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 0aca859c88cb..f669a226d728 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -84,7 +84,7 @@ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) /* Get next tlv */ total_length += hsr_sup_tag->tlv.HSR_TLV_length; - if (!pskb_may_pull(skb, total_length)) + if (!pskb_may_pull(skb, total_length + sizeof(struct hsr_sup_tlv))) return false; skb_pull(skb, total_length); hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data; @@ -100,7 +100,7 @@ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) /* make sure another tlv follows */ total_length += sizeof(struct hsr_sup_tlv) + hsr_sup_tlv->HSR_TLV_length; - if (!pskb_may_pull(skb, total_length)) + if (!pskb_may_pull(skb, total_length + sizeof(struct hsr_sup_tlv))) return false; /* get next tlv */ diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index d09875b33588..a28dfd8490c5 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -35,10 +35,8 @@ bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) rcu_read_lock(); sn = rcu_dereference(hsr->self_node); - if (!sn) { - WARN_ONCE(1, "HSR: No self node\n"); + if (!sn) goto out; - } if (ether_addr_equal(addr, sn->macaddress_A) || ether_addr_equal(addr, sn->macaddress_B)) @@ -163,8 +161,8 @@ void hsr_del_nodes(struct list_head *node_db) struct hsr_node *tmp; list_for_each_entry_safe(node, tmp, node_db, mac_list) { - list_del(&node->mac_list); - hsr_free_node(node); + list_del_rcu(&node->mac_list); + call_rcu(&node->rcu_head, hsr_free_node_rcu); } } @@ -889,7 +887,10 @@ int hsr_get_node_data(struct hsr_priv *hsr, if (node->addr_B_port != HSR_PT_NONE) { port = hsr_port_get_hsr(hsr, node->addr_B_port); - *addr_b_ifindex = port->dev->ifindex; + if (port) + *addr_b_ifindex = port->dev->ifindex; + else + *addr_b_ifindex = -1; } else { *addr_b_ifindex = -1; } diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index 0c07662b44c0..4df76ff50699 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -255,6 +255,11 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev) pr_debug("package xmit\n"); + if (skb->protocol != htons(ETH_P_IPV6)) { + kfree_skb(skb); + return NET_XMIT_DROP; + } + WARN_ON_ONCE(skb->len > IPV6_MIN_MTU); /* We must take a copy of the skb before we modify/replace the ipv6 diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 5fb812443a08..6fd642d2278d 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -124,9 +124,14 @@ static void ah_output_done(void *data, int err) struct iphdr *top_iph = ip_hdr(skb); struct ip_auth_hdr *ah = ip_auth_hdr(skb); int ihl = ip_hdrlen(skb); + int seqhi_len = 0; + __be32 *seqhi; + if (x->props.flags & XFRM_STATE_ESN) + seqhi_len = sizeof(*seqhi); iph = AH_SKB_CB(skb)->tmp; - icv = ah_tmp_icv(iph, ihl); + seqhi = (__be32 *)((char *)iph + ihl); + icv = ah_tmp_icv(seqhi, seqhi_len); memcpy(ah->auth_data, icv, ahp->icv_trunc_len); top_iph->tos = iph->tos; @@ -138,7 +143,7 @@ static void ah_output_done(void *data, int err) } kfree(AH_SKB_CB(skb)->tmp); - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb_to_full_sk(skb), skb, err); } static int ah_output(struct xfrm_state *x, struct sk_buff *skb) @@ -270,12 +275,17 @@ static void ah_input_done(void *data, int err) struct ip_auth_hdr *ah = ip_auth_hdr(skb); int ihl = ip_hdrlen(skb); int ah_hlen = (ah->hdrlen + 2) << 2; + int seqhi_len = 0; + __be32 *seqhi; if (err) goto out; + if (x->props.flags & XFRM_STATE_ESN) + seqhi_len = sizeof(*seqhi); work_iph = AH_SKB_CB(skb)->tmp; - auth_data = ah_tmp_auth(work_iph, ihl); + seqhi = (__be32 *)((char *)work_iph + ihl); + auth_data = ah_tmp_auth(seqhi, seqhi_len); icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len); err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 008edc7f6688..791e15063237 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -168,7 +168,7 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, */ if (prog_ops_moff(prog) != offsetof(struct tcp_congestion_ops, release)) - return &bpf_sk_setsockopt_proto; + return &bpf_sk_setsockopt_nodelay_proto; return NULL; case BPF_FUNC_getsockopt: /* Since get/setsockopt is usually expected to diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 6dfc0bcdef65..513c8215c947 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -419,8 +419,8 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } - if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || - ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) + if (ALIGN(skb->data_len + tailen, L1_CACHE_BYTES) > + PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { @@ -873,7 +873,8 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) nfrags = 1; goto skip_cow; - } else if (!skb_has_frag_list(skb)) { + } else if (!skb_has_frag_list(skb) && + !skb_has_shared_frag(skb)) { nfrags = skb_shinfo(skb)->nr_frags; nfrags++; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7eeff658b467..23e921d313b3 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -961,6 +961,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, if (IS_ERR(rt)) goto out_unlock; + if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + goto ende; + /* peer icmp_ratelimit */ if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit)) goto ende; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index a674fb44ec25..a9ad39064f3b 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -122,16 +122,29 @@ * contradict to specs provided this delay is small enough. */ -#define IGMP_V1_SEEN(in_dev) \ - (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \ - IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \ - ((in_dev)->mr_v1_seen && \ - time_before(jiffies, (in_dev)->mr_v1_seen))) -#define IGMP_V2_SEEN(in_dev) \ - (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \ - IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \ - ((in_dev)->mr_v2_seen && \ - time_before(jiffies, (in_dev)->mr_v2_seen))) +static bool IGMP_V1_SEEN(const struct in_device *in_dev) +{ + unsigned long seen; + + if (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1) + return true; + if (IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1) + return true; + seen = READ_ONCE(in_dev->mr_v1_seen); + return seen && time_before(jiffies, seen); +} + +static bool IGMP_V2_SEEN(const struct in_device *in_dev) +{ + unsigned long seen; + + if (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2) + return true; + if (IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2) + return true; + seen = READ_ONCE(in_dev->mr_v2_seen); + return seen && time_before(jiffies, seen); +} static int unsolicited_report_interval(struct in_device *in_dev) { @@ -954,23 +967,21 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, int max_delay; int mark = 0; struct net *net = dev_net(in_dev->dev); - + unsigned long seen; if (len == 8) { + seen = jiffies + READ_ONCE(in_dev->mr_qrv) * READ_ONCE(in_dev->mr_qi) + + READ_ONCE(in_dev->mr_qri); if (ih->code == 0) { /* Alas, old v1 router presents here. */ max_delay = IGMP_QUERY_RESPONSE_INTERVAL; - in_dev->mr_v1_seen = jiffies + - (in_dev->mr_qrv * in_dev->mr_qi) + - in_dev->mr_qri; + WRITE_ONCE(in_dev->mr_v1_seen, seen); group = 0; } else { /* v2 router present */ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); - in_dev->mr_v2_seen = jiffies + - (in_dev->mr_qrv * in_dev->mr_qi) + - in_dev->mr_qri; + WRITE_ONCE(in_dev->mr_v2_seen, seen); } /* cancel the interface change timer */ WRITE_ONCE(in_dev->mr_ifc_count, 0); @@ -995,6 +1006,8 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ } else { /* v3 */ + unsigned long mr_qi; + if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) return true; @@ -1015,15 +1028,16 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, * received value was zero, use the default or statically * configured value. */ - in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); - in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; - + WRITE_ONCE(in_dev->mr_qrv, + ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv)); + mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; + WRITE_ONCE(in_dev->mr_qi, mr_qi); /* RFC3376, 8.3. Query Response Interval: * The number of seconds represented by the [Query Response * Interval] must be less than the [Query Interval]. */ - if (in_dev->mr_qri >= in_dev->mr_qi) - in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ; + if (READ_ONCE(in_dev->mr_qri) >= mr_qi) + WRITE_ONCE(in_dev->mr_qri, (mr_qi/HZ - 1) * HZ); if (!group) { /* general query */ if (ih3->nsrcs) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 928654c34156..5b934ce8d98a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1108,7 +1108,7 @@ static void reqsk_timer_handler(struct timer_list *t) if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { /* delete timer */ - __inet_csk_reqsk_queue_drop(sk_listener, nreq, true); + __inet_csk_reqsk_queue_drop(sk_listener, nreq, false); goto no_ownership; } @@ -1134,7 +1134,7 @@ no_ownership: } drop: - __inet_csk_reqsk_queue_drop(sk_listener, oreq, true); + __inet_csk_reqsk_queue_drop(oreq->rsk_listener, oreq, true); reqsk_put(oreq); } @@ -1148,6 +1148,9 @@ static bool reqsk_queue_hash_req(struct request_sock *req) /* The timer needs to be setup after a successful insertion. */ req->timeout = tcp_timeout_init((struct sock *)req); timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); + + preempt_disable_nested(); + mod_timer(&req->rsk_timer, jiffies + req->timeout); /* before letting lookups find us, make sure all req fields @@ -1155,6 +1158,9 @@ static bool reqsk_queue_hash_req(struct request_sock *req) */ smp_wmb(); refcount_set(&req->rsk_refcnt, 2 + 1); + + preempt_enable_nested(); + return true; } diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index d8083b9033c2..5b957a831e7c 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -179,7 +179,8 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, seq = read_seqbegin(&base->lock); p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); - if (p) + /* Make sure tree was not modified during our lookup. */ + if (p && !read_seqretry(&base->lock, seq)) return p; /* retry an exact lookup, taking the lock before. diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index be8815ce3ac2..09d745112c15 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -530,6 +530,10 @@ int ip_options_get(struct net *net, struct ip_options_rcu **optp, kfree(opt); return -EINVAL; } + if (opt->opt.srr && !ns_capable(net->user_ns, CAP_NET_RAW)) { + kfree(opt); + return -EPERM; + } kfree(*optp); *optp = opt; return 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e4790cc7b5c2..5bcd73cbdb41 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1233,6 +1233,8 @@ alloc_new_skb: if (err < 0) goto error; copy = err; + if (!(flags & MSG_NO_SHARED_FRAGS)) + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; wmem_alloc_delta += copy; } else if (!zc) { int i = skb_shinfo(skb)->nr_frags; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 2667f53482bd..d3c677e9bff2 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -212,7 +212,7 @@ EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); */ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) { - const struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph; struct icmphdr *icmph; struct iphdr *niph; struct ethhdr eh; @@ -226,7 +226,6 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); - skb_reset_network_header(skb); err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph)); if (err) @@ -236,7 +235,7 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN); if (err) return err; - + iph = ip_hdr(skb); icmph = skb_push(skb, sizeof(*icmph)); *icmph = (struct icmphdr) { .type = ICMP_DEST_UNREACH, @@ -281,7 +280,6 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) */ static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu) { - const struct icmphdr *icmph = icmp_hdr(skb); const struct iphdr *iph = ip_hdr(skb); if (mtu < 576 || iph->frag_off != htons(IP_DF)) @@ -292,9 +290,17 @@ static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu) ipv4_is_lbcast(iph->saddr) || ipv4_is_multicast(iph->saddr)) return 0; - if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type)) - return 0; + if (iph->protocol == IPPROTO_ICMP) { + const struct icmphdr *icmph; + if (!pskb_network_may_pull(skb, iph->ihl * 4 + + offsetofend(struct icmphdr, type))) + return 0; + iph = ip_hdr(skb); + icmph = (void *)iph + iph->ihl * 4; + if (icmp_is_err(icmph->type)) + return 0; + } return iptunnel_pmtud_build_icmp(skb, mtu); } @@ -308,7 +314,7 @@ static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu) */ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) { - const struct ipv6hdr *ip6h = ipv6_hdr(skb); + const struct ipv6hdr *ip6h; struct icmp6hdr *icmp6h; struct ipv6hdr *nip6h; struct ethhdr eh; @@ -323,7 +329,6 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); - skb_reset_network_header(skb); err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h)); if (err) @@ -334,6 +339,7 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) if (err) return err; + ip6h = ipv6_hdr(skb); icmp6h = skb_push(skb, sizeof(*icmp6h)); *icmp6h = (struct icmp6hdr) { .icmp6_type = ICMPV6_PKT_TOOBIG, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2058ca860294..2628cd3a93a6 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -537,15 +537,16 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) }; int err; + rcu_read_lock(); err = ipmr_fib_lookup(net, &fl4, &mrt); if (err < 0) { + rcu_read_unlock(); kfree_skb(skb); return err; } DEV_STATS_ADD(dev, tx_bytes, skb->len); DEV_STATS_INC(dev, tx_packets); - rcu_read_lock(); /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */ ipmr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num), @@ -1112,11 +1113,12 @@ static int ipmr_cache_report(const struct mr_table *mrt, msg->im_vif_hi = vifi >> 8; ipv4_pktinfo_prepare(mroute_sk, pkt, false); memcpy(skb->cb, pkt->cb, sizeof(skb->cb)); - /* Add our header */ - igmp = skb_put(skb, sizeof(struct igmphdr)); + /* Add our header. + * Note that code, csum and group fields are cleared. + */ + igmp = skb_put_zero(skb, sizeof(struct igmphdr)); igmp->type = assert; msg->im_msgtype = assert; - igmp->code = 0; ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ skb->transport_header = skb->network_header; } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 97ead883e4a1..ad2259678c78 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1501,13 +1501,11 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len static void __arpt_unregister_table(struct net *net, struct xt_table *table) { - struct xt_table_info *private; - void *loc_cpu_entry; + struct xt_table_info *private = table->private; struct module *table_owner = table->me; + void *loc_cpu_entry; struct arpt_entry *iter; - private = xt_unregister_table(table); - /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) @@ -1515,6 +1513,7 @@ static void __arpt_unregister_table(struct net *net, struct xt_table *table) if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); + kfree(table); } int arpt_register_table(struct net *net, @@ -1522,13 +1521,11 @@ int arpt_register_table(struct net *net, const struct arpt_replace *repl, const struct nf_hook_ops *template_ops) { - struct nf_hook_ops *ops; - unsigned int num_ops; - int ret, i; - struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; - void *loc_cpu_entry; + struct xt_table_info *newinfo; struct xt_table *new_table; + void *loc_cpu_entry; + int ret; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) @@ -1543,7 +1540,7 @@ int arpt_register_table(struct net *net, return ret; } - new_table = xt_register_table(net, table, &bootstrap, newinfo); + new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct arpt_entry *iter; @@ -1553,46 +1550,12 @@ int arpt_register_table(struct net *net, return PTR_ERR(new_table); } - num_ops = hweight32(table->valid_hooks); - if (num_ops == 0) { - ret = -EINVAL; - goto out_free; - } - - ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); - if (!ops) { - ret = -ENOMEM; - goto out_free; - } - - for (i = 0; i < num_ops; i++) - ops[i].priv = new_table; - - new_table->ops = ops; - - ret = nf_register_net_hooks(net, ops, num_ops); - if (ret != 0) - goto out_free; - return ret; - -out_free: - __arpt_unregister_table(net, new_table); - return ret; -} - -void arpt_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } -EXPORT_SYMBOL(arpt_unregister_table_pre_exit); void arpt_unregister_table(struct net *net, const char *name) { - struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); + struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_ARP, name); if (table) __arpt_unregister_table(net, table); diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 78cd5ee24448..370b635e3523 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -43,7 +43,7 @@ static int arptable_filter_table_init(struct net *net) static void __net_exit arptable_filter_net_pre_exit(struct net *net) { - arpt_unregister_table_pre_exit(net, "filter"); + xt_unregister_table_pre_exit(net, NFPROTO_ARP, "filter"); } static void __net_exit arptable_filter_net_exit(struct net *net) @@ -58,32 +58,33 @@ static struct pernet_operations arptable_filter_net_ops = { static int __init arptable_filter_init(void) { - int ret = xt_register_template(&packet_filter, - arptable_filter_table_init); - - if (ret < 0) - return ret; + int ret; arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arpt_do_table); - if (IS_ERR(arpfilter_ops)) { - xt_unregister_template(&packet_filter); + if (IS_ERR(arpfilter_ops)) return PTR_ERR(arpfilter_ops); - } ret = register_pernet_subsys(&arptable_filter_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_filter, + arptable_filter_table_init); if (ret < 0) { - xt_unregister_template(&packet_filter); - kfree(arpfilter_ops); - return ret; + unregister_pernet_subsys(&arptable_filter_net_ops); + goto err_free; } + return 0; +err_free: + kfree(arpfilter_ops); return ret; } static void __exit arptable_filter_fini(void) { - unregister_pernet_subsys(&arptable_filter_net_ops); xt_unregister_template(&packet_filter); + unregister_pernet_subsys(&arptable_filter_net_ops); kfree(arpfilter_ops); } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 23c8deff8095..5cbdb0815857 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1704,12 +1704,10 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) static void __ipt_unregister_table(struct net *net, struct xt_table *table) { - struct xt_table_info *private; - void *loc_cpu_entry; + struct xt_table_info *private = table->private; struct module *table_owner = table->me; struct ipt_entry *iter; - - private = xt_unregister_table(table); + void *loc_cpu_entry; /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; @@ -1718,19 +1716,18 @@ static void __ipt_unregister_table(struct net *net, struct xt_table *table) if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); + kfree(table); } int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, const struct nf_hook_ops *template_ops) { - struct nf_hook_ops *ops; - unsigned int num_ops; - int ret, i; - struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; - void *loc_cpu_entry; + struct xt_table_info *newinfo; struct xt_table *new_table; + void *loc_cpu_entry; + int ret; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) @@ -1745,7 +1742,7 @@ int ipt_register_table(struct net *net, const struct xt_table *table, return ret; } - new_table = xt_register_table(net, table, &bootstrap, newinfo); + new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct ipt_entry *iter; @@ -1755,51 +1752,12 @@ int ipt_register_table(struct net *net, const struct xt_table *table, return PTR_ERR(new_table); } - /* No template? No need to do anything. This is used by 'nat' table, it registers - * with the nat core instead of the netfilter core. - */ - if (!template_ops) - return 0; - - num_ops = hweight32(table->valid_hooks); - if (num_ops == 0) { - ret = -EINVAL; - goto out_free; - } - - ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); - if (!ops) { - ret = -ENOMEM; - goto out_free; - } - - for (i = 0; i < num_ops; i++) - ops[i].priv = new_table; - - new_table->ops = ops; - - ret = nf_register_net_hooks(net, ops, num_ops); - if (ret != 0) - goto out_free; - return ret; - -out_free: - __ipt_unregister_table(net, new_table); - return ret; -} - -void ipt_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } void ipt_unregister_table_exit(struct net *net, const char *name) { - struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); + struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_IPV4, name); if (table) __ipt_unregister_table(net, table); @@ -1887,7 +1845,6 @@ static void __exit ip_tables_fini(void) } EXPORT_SYMBOL(ipt_register_table); -EXPORT_SYMBOL(ipt_unregister_table_pre_exit); EXPORT_SYMBOL(ipt_unregister_table_exit); EXPORT_SYMBOL(ipt_do_table); module_init(ip_tables_init); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 3ab908b74795..672d7da1071d 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -61,7 +61,7 @@ static int __net_init iptable_filter_net_init(struct net *net) static void __net_exit iptable_filter_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "filter"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "filter"); } static void __net_exit iptable_filter_net_exit(struct net *net) @@ -77,32 +77,33 @@ static struct pernet_operations iptable_filter_net_ops = { static int __init iptable_filter_init(void) { - int ret = xt_register_template(&packet_filter, - iptable_filter_table_init); - - if (ret < 0) - return ret; + int ret; filter_ops = xt_hook_ops_alloc(&packet_filter, ipt_do_table); - if (IS_ERR(filter_ops)) { - xt_unregister_template(&packet_filter); + if (IS_ERR(filter_ops)) return PTR_ERR(filter_ops); - } ret = register_pernet_subsys(&iptable_filter_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_filter, + iptable_filter_table_init); if (ret < 0) { - xt_unregister_template(&packet_filter); - kfree(filter_ops); - return ret; + unregister_pernet_subsys(&iptable_filter_net_ops); + goto err_free; } return 0; +err_free: + kfree(filter_ops); + return ret; } static void __exit iptable_filter_fini(void) { - unregister_pernet_subsys(&iptable_filter_net_ops); xt_unregister_template(&packet_filter); + unregister_pernet_subsys(&iptable_filter_net_ops); kfree(filter_ops); } diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 385d945d8ebe..13d25d9a4610 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -96,7 +96,7 @@ static int iptable_mangle_table_init(struct net *net) static void __net_exit iptable_mangle_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "mangle"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "mangle"); } static void __net_exit iptable_mangle_net_exit(struct net *net) @@ -111,32 +111,33 @@ static struct pernet_operations iptable_mangle_net_ops = { static int __init iptable_mangle_init(void) { - int ret = xt_register_template(&packet_mangler, - iptable_mangle_table_init); - if (ret < 0) - return ret; + int ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook); - if (IS_ERR(mangle_ops)) { - xt_unregister_template(&packet_mangler); - ret = PTR_ERR(mangle_ops); - return ret; - } + if (IS_ERR(mangle_ops)) + return PTR_ERR(mangle_ops); ret = register_pernet_subsys(&iptable_mangle_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_mangler, + iptable_mangle_table_init); if (ret < 0) { - xt_unregister_template(&packet_mangler); - kfree(mangle_ops); - return ret; + unregister_pernet_subsys(&iptable_mangle_net_ops); + goto err_free; } + return 0; +err_free: + kfree(mangle_ops); return ret; } static void __exit iptable_mangle_fini(void) { - unregister_pernet_subsys(&iptable_mangle_net_ops); xt_unregister_template(&packet_mangler); + unregister_pernet_subsys(&iptable_mangle_net_ops); kfree(mangle_ops); } diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 625a1ca13b1b..a0df72554025 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -119,8 +119,11 @@ static int iptable_nat_table_init(struct net *net) } ret = ipt_nat_register_lookups(net); - if (ret < 0) + if (ret < 0) { + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "nat"); + synchronize_rcu(); ipt_unregister_table_exit(net, "nat"); + } kfree(repl); return ret; @@ -129,6 +132,7 @@ static int iptable_nat_table_init(struct net *net) static void __net_exit iptable_nat_net_pre_exit(struct net *net) { ipt_nat_unregister_lookups(net); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "nat"); } static void __net_exit iptable_nat_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 0e7f53964d0a..2745c22f4034 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -53,7 +53,7 @@ static int iptable_raw_table_init(struct net *net) static void __net_exit iptable_raw_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "raw"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "raw"); } static void __net_exit iptable_raw_net_exit(struct net *net) @@ -77,32 +77,32 @@ static int __init iptable_raw_init(void) pr_info("Enabling raw table before defrag\n"); } - ret = xt_register_template(table, - iptable_raw_table_init); - if (ret < 0) - return ret; - rawtable_ops = xt_hook_ops_alloc(table, ipt_do_table); - if (IS_ERR(rawtable_ops)) { - xt_unregister_template(table); + if (IS_ERR(rawtable_ops)) return PTR_ERR(rawtable_ops); - } ret = register_pernet_subsys(&iptable_raw_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(table, + iptable_raw_table_init); if (ret < 0) { - xt_unregister_template(table); - kfree(rawtable_ops); - return ret; + unregister_pernet_subsys(&iptable_raw_net_ops); + goto err_free; } + return 0; +err_free: + kfree(rawtable_ops); return ret; } static void __exit iptable_raw_fini(void) { + xt_unregister_template(&packet_raw); unregister_pernet_subsys(&iptable_raw_net_ops); kfree(rawtable_ops); - xt_unregister_template(&packet_raw); } module_init(iptable_raw_init); diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index d885443cb267..491894511c54 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -50,7 +50,7 @@ static int iptable_security_table_init(struct net *net) static void __net_exit iptable_security_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "security"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "security"); } static void __net_exit iptable_security_net_exit(struct net *net) @@ -65,33 +65,34 @@ static struct pernet_operations iptable_security_net_ops = { static int __init iptable_security_init(void) { - int ret = xt_register_template(&security_table, - iptable_security_table_init); - - if (ret < 0) - return ret; + int ret; sectbl_ops = xt_hook_ops_alloc(&security_table, ipt_do_table); - if (IS_ERR(sectbl_ops)) { - xt_unregister_template(&security_table); + if (IS_ERR(sectbl_ops)) return PTR_ERR(sectbl_ops); - } ret = register_pernet_subsys(&iptable_security_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&security_table, + iptable_security_table_init); if (ret < 0) { - xt_unregister_template(&security_table); - kfree(sectbl_ops); - return ret; + unregister_pernet_subsys(&iptable_security_net_ops); + goto err_free; } + return 0; +err_free: + kfree(sectbl_ops); return ret; } static void __exit iptable_security_fini(void) { + xt_unregister_template(&security_table); unregister_pernet_subsys(&iptable_security_net_ops); kfree(sectbl_ops); - xt_unregister_template(&security_table); } module_init(iptable_security_init); diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index 5080fa5fbf6a..f9c6755f5ec5 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -94,6 +94,9 @@ struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb, #endif int doff = 0; + if (ntohs(iph->frag_off) & IP_OFFSET) + return NULL; + if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) { struct tcphdr _hdr; struct udphdr *hp; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 5aaf9c62c8e1..68e88cb3e55c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -391,7 +391,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, * in, reject the frame as invalid */ err = -EINVAL; - if (iphlen > length) + if (iphlen > length || iphlen < sizeof(*iph)) goto error_free; if (iphlen >= sizeof(*iph)) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index bc1296f0ea69..3d62d45d84bd 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1272,7 +1272,7 @@ static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb) __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, skb->dev ? skb->dev->name : "?"); kfree_skb(skb); - WARN_ON(1); + WARN_ON_ONCE(1); return 0; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d8bdb1bdbff1..c0e85cc171ae 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1705,10 +1705,10 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net) { const struct ctl_table *table; - kfree(net->ipv4.sysctl_local_reserved_ports); table = net->ipv4.ipv4_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.ipv4_hdr); kfree(table); + kfree(net->ipv4.sysctl_local_reserved_ports); } static __net_initdata struct pernet_operations ipv4_sysctl_ops = { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 432fa28e47d4..389a7cc17110 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -299,9 +299,6 @@ enum { DEFINE_PER_CPU(unsigned int, tcp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); -DEFINE_PER_CPU(u32, tcp_tw_isn); -EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn); - long sysctl_tcp_mem[3] __read_mostly; DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index a97cdf3e6af4..0a4b38b315fe 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -116,7 +116,8 @@ struct tcp_ao_key *tcp_ao_established_key(const struct sock *sk, { struct tcp_ao_key *key; - hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) { + hlist_for_each_entry_rcu(key, &ao->head, node, + sk_fullsock(sk) && lockdep_sock_is_held(sk)) { if ((sndid >= 0 && key->sndid != sndid) || (rcvid >= 0 && key->rcvid != rcvid)) continue; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d5c9e65d9760..de9f68a9c0cf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7589,6 +7589,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct sock *sk, struct sk_buff *skb) { struct tcp_fastopen_cookie foc = { .len = -1 }; + u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; const struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); @@ -7599,20 +7600,16 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct dst_entry *dst; struct flowi fl; u8 syncookies; - u32 isn; #ifdef CONFIG_TCP_AO const struct tcp_ao_hdr *aoh; #endif - isn = __this_cpu_read(tcp_tw_isn); - if (isn) { - /* TW buckets are converted to open requests without - * limitations, they conserve resources and peer is - * evidently real one. - */ - __this_cpu_write(tcp_tw_isn, 0); - } else { + /* If isn is non-zero, this SYN originally matched a TIME_WAIT socket. + * TW sockets are converted to open requests without limitations, + * we skip the queue limits and syncookie checks in the block below. + */ + if (!isn) { syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8fc24c3743c5..fdc81150ff6c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1827,7 +1827,6 @@ INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; - struct sock *rsk; reason = psp_sk_rx_policy_check(sk, skb); if (reason) @@ -1863,24 +1862,21 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; if (nsk != sk) { reason = tcp_child_process(sk, nsk, skb); - if (reason) { - rsk = nsk; + sock_put(nsk); + if (reason) goto reset; - } return 0; } } else sock_rps_save_rxhash(sk, skb); reason = tcp_rcv_state_process(sk, skb); - if (reason) { - rsk = sk; + if (reason) goto reset; - } return 0; reset: - tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); + tcp_v4_send_reset(sk, skb, sk_rst_convert_drop_reason(reason)); discard: sk_skb_reason_drop(sk, skb, reason); /* Be careful here. If this function gets more complicated and @@ -2193,13 +2189,16 @@ lookup: rst_reason = sk_rst_convert_drop_reason(drop_reason); tcp_v4_send_reset(nsk, skb, rst_reason); + sock_put(nsk); goto discard_and_relse; } + sock_put(nsk); sock_put(sk); return 0; } } + isn = 0; process: if (static_branch_unlikely(&ip4_min_ttl)) { /* min_ttl can be changed concurrently from do_ip_setsockopt() */ @@ -2229,6 +2228,7 @@ process: th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); + TCP_SKB_CB(skb)->tcp_tw_isn = isn; skb->dev = NULL; @@ -2315,7 +2315,6 @@ do_time_wait: sk = sk2; tcp_v4_restore_cb(skb); refcounted = false; - __this_cpu_write(tcp_tw_isn, isn); goto process; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 199f0b579e89..e6092c3ac840 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -1012,6 +1012,6 @@ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, } bh_unlock_sock(child); - sock_put(child); + return reason; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f9d8755705f7..6e4bb411dc04 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2626,6 +2626,7 @@ static int tcp_clone_payload(struct sock *sk, struct sk_buff *to, todo = min_t(int, skb_frag_size(fragfrom), probe_size - len); len += todo; + skb_shinfo(to)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; if (lastfrag && skb_frag_page(fragfrom) == skb_frag_page(lastfrag) && skb_frag_off(fragfrom) == skb_frag_off(lastfrag) + diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0ac2bf4f8759..70f6cbd4ef73 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2011,6 +2011,14 @@ try_again: } WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); + + /* + * skb->dev still aliases the UDP rx dev_scratch (its charge was freed + * on dequeue above); a sockmap verdict program may deref it via + * bpf_sk_lookup_*(), so clear it -> bpf_skc_lookup() uses skb->sk + */ + skb->dev = NULL; + return recv_actor(sk, skb); } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index a0813d425b71..29651b1a0bc7 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -482,11 +482,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, struct sock *sk = gso_skb->sk; unsigned int sum_truesize = 0; struct sk_buff *segs, *seg; - __be16 newlen, msslen; struct udphdr *uh; unsigned int mss; bool copy_dtor; __sum16 check; + __be16 newlen; int ret = 0; mss = skb_shinfo(gso_skb)->gso_size; @@ -555,15 +555,6 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, return segs; } - msslen = htons(sizeof(*uh) + mss); - - /* GSO partial and frag_list segmentation only requires splitting - * the frame into an MSS multiple and possibly a remainder, both - * cases return a GSO skb. So update the mss now. - */ - if (skb_is_gso(segs)) - mss *= skb_shinfo(segs)->gso_segs; - seg = segs; uh = udp_hdr(seg); @@ -586,7 +577,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, if (!seg->next) break; - uh->len = msslen; + uh->len = newlen; uh->check = check; if (seg->ip_summed == CHECKSUM_PARTIAL) @@ -599,9 +590,12 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, uh = udp_hdr(seg); } - /* last packet can be partial gso_size, account for that in checksum */ - newlen = htons(skb_tail_pointer(seg) - skb_transport_header(seg) + - seg->data_len); + /* Unless skb fits perfectly as GSO_PARTIAL, the trailing + * segment may not be full MSS, account for that in the checksum + */ + if (!skb_is_gso(seg)) + newlen = htons(skb_tail_pointer(seg) - + skb_transport_header(seg) + seg->data_len); check = csum16_add(csum16_sub(uh->check, uh->len), newlen); uh->len = newlen; diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index c024aa77f25b..c3806c6ac96f 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -164,7 +164,7 @@ config IPV6_SIT select INET_TUNNEL select NET_IP_TUNNEL select IPV6_NDISC_NODETYPE - default y + default m help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the @@ -172,7 +172,7 @@ config IPV6_SIT into IPv4 packets. This is useful if you want to connect two IPv6 networks over an IPv4-only path. - Saying M here will produce a module called sit. If unsure, say Y. + Saying M here will produce a module called sit. If unsure, say M. config IPV6_SIT_6RD bool "IPv6: IPv6 Rapid Deployment (6RD)" diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5476b6536eb7..bb84a78b80f6 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1013,7 +1013,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) list_for_each(p, &idev->addr_list) { struct inet6_ifaddr *ifa = list_entry(p, struct inet6_ifaddr, if_list); - if (ifp_scope > ipv6_addr_src_scope(&ifa->addr)) + if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr)) break; } diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index cb26beea4398..76f7a2de9108 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -317,14 +317,19 @@ static void ah6_output_done(void *data, int err) struct ipv6hdr *top_iph = ipv6_hdr(skb); struct ip_auth_hdr *ah = ip_auth_hdr(skb); struct tmp_ext *iph_ext; + int seqhi_len = 0; + __be32 *seqhi; extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr); if (extlen) extlen += sizeof(*iph_ext); + if (x->props.flags & XFRM_STATE_ESN) + seqhi_len = sizeof(*seqhi); iph_base = AH_SKB_CB(skb)->tmp; iph_ext = ah_tmp_ext(iph_base); - icv = ah_tmp_icv(iph_ext, extlen); + seqhi = (__be32 *)((char *)iph_ext + extlen); + icv = ah_tmp_icv(seqhi, seqhi_len); memcpy(ah->auth_data, icv, ahp->icv_trunc_len); memcpy(top_iph, iph_base, IPV6HDR_BASELEN); @@ -332,7 +337,7 @@ static void ah6_output_done(void *data, int err) ah6_restore_hdrs(top_iph, iph_ext, extlen); kfree(AH_SKB_CB(skb)->tmp); - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb_to_full_sk(skb), skb, err); } static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) @@ -471,13 +476,18 @@ static void ah6_input_done(void *data, int err) struct ip_auth_hdr *ah = ip_auth_hdr(skb); int hdr_len = skb_network_header_len(skb); int ah_hlen = ipv6_authlen(ah); + int seqhi_len = 0; + __be32 *seqhi; if (err) goto out; + if (x->props.flags & XFRM_STATE_ESN) + seqhi_len = sizeof(*seqhi); work_iph = AH_SKB_CB(skb)->tmp; auth_data = ah_tmp_auth(work_iph, hdr_len); - icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len); + seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len); + icv = ah_tmp_icv(seqhi, seqhi_len); err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 67a42e01dfc3..be6dac8a8566 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -243,16 +243,16 @@ static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca) { unsigned int hash = inet6_acaddr_hash(net, &aca->aca_addr); - spin_lock(&acaddr_hash_lock); + spin_lock_bh(&acaddr_hash_lock); hlist_add_head_rcu(&aca->aca_addr_lst, &inet6_acaddr_lst[hash]); - spin_unlock(&acaddr_hash_lock); + spin_unlock_bh(&acaddr_hash_lock); } static void ipv6_del_acaddr_hash(struct ifacaddr6 *aca) { - spin_lock(&acaddr_hash_lock); + spin_lock_bh(&acaddr_hash_lock); hlist_del_init_rcu(&aca->aca_addr_lst); - spin_unlock(&acaddr_hash_lock); + spin_unlock_bh(&acaddr_hash_lock); } static void aca_get(struct ifacaddr6 *aca) @@ -371,10 +371,10 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) aca->aca_next = idev->ac_list; rcu_assign_pointer(idev->ac_list, aca); - write_unlock_bh(&idev->lock); - ipv6_add_acaddr_hash(net, aca); + write_unlock_bh(&idev->lock); + ip6_ins_rt(net, f6i); addrconf_join_solict(idev->dev, &aca->aca_addr); @@ -649,8 +649,8 @@ void ipv6_anycast_cleanup(void) { int i; - spin_lock(&acaddr_hash_lock); + spin_lock_bh(&acaddr_hash_lock); for (i = 0; i < IN6_ADDR_HSIZE; i++) WARN_ON(!hlist_empty(&inet6_acaddr_lst[i])); - spin_unlock(&acaddr_hash_lock); + spin_unlock_bh(&acaddr_hash_lock); } diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index ca3605acb433..38d7b4845281 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -617,6 +617,18 @@ void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg, } } +static u16 ipv6_get_exthdr_len(const struct sk_buff *skb, const u8 *ptr) +{ + u16 len; + + if (ptr + 2 > skb_tail_pointer(skb)) + return 0; + + len = (ptr[1] + 1) << 3; + + return (len <= skb_tail_pointer(skb) - ptr) ? len : 0; +} + void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { @@ -643,7 +655,10 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, /* HbH is allowed only once */ if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) { u8 *ptr = nh + sizeof(struct ipv6hdr); - put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); + u16 len = ipv6_get_exthdr_len(skb, ptr); + + if (len) + put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, len, ptr); } if (opt->lastopt && @@ -664,26 +679,37 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, unsigned int len; u8 *ptr = nh + off; + if (ptr + 2 > skb_tail_pointer(skb)) + return; + switch (nexthdr) { case IPPROTO_DSTOPTS: nexthdr = ptr[0]; - len = (ptr[1] + 1) << 3; + len = ipv6_get_exthdr_len(skb, ptr); + if (!len) + return; if (np->rxopt.bits.dstopts) put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, len, ptr); break; case IPPROTO_ROUTING: nexthdr = ptr[0]; - len = (ptr[1] + 1) << 3; + len = ipv6_get_exthdr_len(skb, ptr); + if (!len) + return; if (np->rxopt.bits.srcrt) put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, len, ptr); break; case IPPROTO_AH: nexthdr = ptr[0]; len = (ptr[1] + 2) << 2; + if (ptr + len > skb_tail_pointer(skb)) + return; break; default: nexthdr = ptr[0]; - len = (ptr[1] + 1) << 3; + len = ipv6_get_exthdr_len(skb, ptr); + if (!len) + return; break; } @@ -705,19 +731,31 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, } if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) { u8 *ptr = nh + sizeof(struct ipv6hdr); - put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr); + u16 len = ipv6_get_exthdr_len(skb, ptr); + + if (len) + put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, len, ptr); } if (np->rxopt.bits.odstopts && opt->dst0) { u8 *ptr = nh + opt->dst0; - put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); + u16 len = ipv6_get_exthdr_len(skb, ptr); + + if (len) + put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, len, ptr); } if (np->rxopt.bits.osrcrt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(nh + opt->srcrt); - put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, (rthdr->hdrlen+1) << 3, rthdr); + u16 len = ipv6_get_exthdr_len(skb, (u8 *)rthdr); + + if (len) + put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, len, rthdr); } if (np->rxopt.bits.odstopts && opt->dst1) { u8 *ptr = nh + opt->dst1; - put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); + u16 len = ipv6_get_exthdr_len(skb, ptr); + + if (len) + put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, len, ptr); } if (np->rxopt.bits.rxorigdstaddr) { struct sockaddr_in6 sin6; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 9f75313734f8..57481e423e59 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -448,8 +448,8 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info return err; } - if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || - ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) + if (ALIGN(skb->data_len + tailen, L1_CACHE_BYTES) > + PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { @@ -915,7 +915,8 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) nfrags = 1; goto skip_cow; - } else if (!skb_has_frag_list(skb)) { + } else if (!skb_has_frag_list(skb) && + !skb_has_shared_frag(skb)) { nfrags = skb_shinfo(skb)->nr_frags; nfrags++; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 03cbce842c1a..43f46ef9c53b 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -184,6 +184,8 @@ static bool ip6_parse_tlv(bool hopbyhop, case IPV6_TLV_JUMBO: if (!ipv6_hop_jumbo(skb, off)) return false; + + nh = skb_network_header(skb); break; case IPV6_TLV_CALIPSO: if (!ipv6_hop_calipso(skb, off)) @@ -201,6 +203,8 @@ static bool ip6_parse_tlv(bool hopbyhop, case IPV6_TLV_HAO: if (!ipv6_dest_hao(skb, off)) return false; + + nh = skb_network_header(skb); break; #endif default: @@ -544,7 +548,7 @@ looped_back: * unsigned char which is segments_left field. Should not be * higher than that. */ - if (r || (n + 1) > 255) { + if (r || (n + 1) > 127) { kfree_skb(skb); return -1; } @@ -910,16 +914,27 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) { + enum skb_drop_reason drop_reason; struct ioam6_trace_hdr *trace; struct ioam6_namespace *ns; + struct inet6_dev *idev; struct ioam6_hdr *hdr; + drop_reason = SKB_DROP_REASON_IP_INHDR; + /* Bad alignment (must be 4n-aligned) */ if (optoff & 3) goto drop; + /* Does the device still have IPv6 configuration? */ + idev = __in6_dev_get(skb->dev); + if (!idev) { + drop_reason = SKB_DROP_REASON_IPV6DISABLED; + goto drop; + } + /* Ignore if IOAM is not enabled on ingress */ - if (!READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_enabled)) + if (!READ_ONCE(idev->cnf.ioam6_enabled)) goto ignore; /* Truncated Option header */ @@ -955,9 +970,9 @@ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) if (skb_ensure_writable(skb, optoff + 2 + hdr->opt_len)) goto drop; - /* Trace pointer may have changed */ - trace = (struct ioam6_trace_hdr *)(skb_network_header(skb) - + optoff + sizeof(*hdr)); + /* Trace and hdr pointers may have changed */ + hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff); + trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr)); ioam6_fill_trace_data(skb, ns, trace, true); @@ -972,7 +987,7 @@ ignore: return true; drop: - kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); + kfree_skb_reason(skb, drop_reason); return false; } diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 49e31e4ae7b7..9d06d487e8b1 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -73,6 +73,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, __be16 *frag_offp) { u8 nexthdr = *nexthdrp; + int exthdr_cnt = 0; *frag_offp = 0; @@ -82,6 +83,8 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, if (nexthdr == NEXTHDR_NONE) return -1; + if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT)) + return -1; hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -1; @@ -190,6 +193,7 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, { unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; + int exthdr_cnt = 0; bool found; if (fragoff) @@ -216,6 +220,9 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, return -ENOENT; } + if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT)) + return -EBADMSG; + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -EBADMSG; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index c92f98c6f6ec..b1ccdf0dc646 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -36,11 +36,11 @@ /* FL hash table */ #define FL_MAX_PER_SOCK 32 -#define FL_MAX_SIZE 4096 +#define FL_MAX_SIZE 8192 #define FL_HASH_MASK 255 #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) -static atomic_t fl_size = ATOMIC_INIT(0); +static int fl_size; static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(struct timer_list *unused); @@ -162,8 +162,9 @@ static void ip6_fl_gc(struct timer_list *unused) ttd = fl->expires; if (time_after_eq(now, ttd)) { *flp = fl->next; + fl_size--; + fl->fl_net->ipv6.flowlabel_count--; fl_free(fl); - atomic_dec(&fl_size); continue; } if (!sched || time_before(ttd, sched)) @@ -172,7 +173,7 @@ static void ip6_fl_gc(struct timer_list *unused) flp = &fl->next; } } - if (!sched && atomic_read(&fl_size)) + if (!sched && fl_size) sched = now + FL_MAX_LINGER; if (sched) { mod_timer(&ip6_fl_gc_timer, sched); @@ -196,7 +197,8 @@ static void __net_exit ip6_fl_purge(struct net *net) atomic_read(&fl->users) == 0) { *flp = fl->next; fl_free(fl); - atomic_dec(&fl_size); + fl_size--; + net->ipv6.flowlabel_count--; continue; } flp = &fl->next; @@ -210,10 +212,10 @@ static struct ip6_flowlabel *fl_intern(struct net *net, { struct ip6_flowlabel *lfl; + lockdep_assert_held(&ip6_fl_lock); + fl->label = label & IPV6_FLOWLABEL_MASK; - rcu_read_lock(); - spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK; @@ -235,8 +237,6 @@ static struct ip6_flowlabel *fl_intern(struct net *net, lfl = __fl_lookup(net, fl->label); if (lfl) { atomic_inc(&lfl->users); - spin_unlock_bh(&ip6_fl_lock); - rcu_read_unlock(); return lfl; } } @@ -244,9 +244,8 @@ static struct ip6_flowlabel *fl_intern(struct net *net, fl->lastuse = jiffies; fl->next = fl_ht[FL_HASH(fl->label)]; rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); - atomic_inc(&fl_size); - spin_unlock_bh(&ip6_fl_lock); - rcu_read_unlock(); + fl_size++; + net->ipv6.flowlabel_count++; return NULL; } @@ -464,10 +463,17 @@ done: static int mem_check(struct sock *sk) { - int room = FL_MAX_SIZE - atomic_read(&fl_size); + const int unpriv_total_limit = FL_MAX_SIZE - (FL_MAX_SIZE / 4); + const int unpriv_user_limit = unpriv_total_limit / 2; + struct net *net = sock_net(sk); + int room; struct ipv6_fl_socklist *sfl; int count = 0; + lockdep_assert_held(&ip6_fl_lock); + + room = FL_MAX_SIZE - fl_size; + if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; @@ -478,7 +484,9 @@ static int mem_check(struct sock *sk) if (room <= 0 || ((count >= FL_MAX_PER_SOCK || - (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) && + (count > 0 && room < FL_MAX_SIZE / 2) || + room < FL_MAX_SIZE / 4 || + net->ipv6.flowlabel_count >= unpriv_user_limit) && !capable(CAP_NET_ADMIN))) return -ENOBUFS; @@ -692,11 +700,19 @@ release: if (!sfl1) goto done; + rcu_read_lock(); + spin_lock_bh(&ip6_fl_lock); err = mem_check(sk); + if (err == 0) + fl1 = fl_intern(net, fl, freq->flr_label); + else + fl1 = NULL; + spin_unlock_bh(&ip6_fl_lock); + rcu_read_unlock(); + if (err != 0) goto done; - fl1 = fl_intern(net, fl, freq->flr_label); if (fl1) goto recheck; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 63fc8556b475..365b4059eb20 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -2262,10 +2262,11 @@ static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id); + struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm p; - struct ip6_tnl *t; + struct ip6gre_net *ign; + ign = net_generic(t->net, ip6gre_net_id); t = ip6gre_changelink_common(dev, tb, data, &p, extack); if (IS_ERR(t)) return PTR_ERR(t); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 967b07aeb683..8972863c93ee 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -403,6 +403,7 @@ INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *)); void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, bool have_final) { + int exthdr_cnt = IP6CB(skb)->flags & IP6SKB_HOPBYHOP ? 1 : 0; const struct inet6_protocol *ipprot; struct inet6_dev *idev; unsigned int nhoff; @@ -487,6 +488,10 @@ resubmit_final: nexthdr = ret; goto resubmit_final; } else { + if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT)) { + SKB_DR_SET(reason, IPV6_TOO_MANY_EXTHDRS); + goto discard; + } goto resubmit; } } else if (ret == 0) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7e92909ab5be..c14adcdd4396 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -468,6 +468,7 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) default: break; } + hdr = ipv6_hdr(skb); } /* @@ -582,6 +583,8 @@ int ip6_forward(struct sk_buff *skb) if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) { int proxied = ip6_forward_proxy_check(skb); + + hdr = ipv6_hdr(skb); if (proxied > 0) { /* It's tempting to decrease the hop limit * here by 1, as we do at the end of the @@ -1794,6 +1797,8 @@ alloc_new_skb: if (err < 0) goto error; copy = err; + if (!(flags & MSG_NO_SHARED_FRAGS)) + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; wmem_alloc_delta += copy; } else if (!zc) { int i = skb_shinfo(skb)->nr_frags; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index c468c83af0f2..9d1037ac082f 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -399,11 +399,15 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) unsigned int nhoff = raw - skb->data; unsigned int off = nhoff + sizeof(*ipv6h); u8 nexthdr = ipv6h->nexthdr; + int exthdr_cnt = 0; while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { struct ipv6_opt_hdr *hdr; u16 optlen; + if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT)) + break; + if (!pskb_may_pull(skb, off + sizeof(*hdr))) break; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index ad5290be4dd6..df793c8bfffb 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -722,10 +722,11 @@ vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p, static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p, bool keep_mtu) { - struct net *net = dev_net(t->dev); - struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct net *net = t->net; + struct vti6_net *ip6n; int err; + ip6n = net_generic(net, vti6_net_id); vti6_tnl_unlink(ip6n, t); synchronize_net(); err = vti6_tnl_change(t, p, keep_mtu); @@ -834,17 +835,24 @@ vti6_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data if (p.proto != IPPROTO_IPV6 && p.proto != 0) break; vti6_parm_from_user(&p1, &p); - t = vti6_locate(net, &p1, cmd == SIOCADDTUNNEL); if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) { + struct ip6_tnl *self = netdev_priv(dev); + + err = -EPERM; + if (!ns_capable(self->net->user_ns, CAP_NET_ADMIN)) + break; + t = vti6_locate(self->net, &p1, false); if (t) { if (t->dev != dev) { err = -EEXIST; break; } } else - t = netdev_priv(dev); + t = self; err = vti6_update(t, &p1, false); + } else { + t = vti6_locate(net, &p1, cmd == SIOCADDTUNNEL); } if (t) { err = 0; @@ -1031,11 +1039,12 @@ static int vti6_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip6_tnl *t; + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = t->net; struct __ip6_tnl_parm p; - struct net *net = dev_net(dev); - struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct vti6_net *ip6n; + ip6n = net_generic(net, vti6_net_id); if (dev == ip6n->fb_tnl_dev) return -EINVAL; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 3330adcf26db..d9b855d5191b 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1424,9 +1424,9 @@ out: static void __mld_query_work(struct sk_buff *skb) { struct mld2_query *mlh2 = NULL; - const struct in6_addr *group; unsigned long max_delay; struct inet6_dev *idev; + struct in6_addr group; struct ifmcaddr6 *ma; struct mld_msg *mld; int group_type; @@ -1458,8 +1458,8 @@ static void __mld_query_work(struct sk_buff *skb) goto kfree_skb; mld = (struct mld_msg *)icmp6_hdr(skb); - group = &mld->mld_mca; - group_type = ipv6_addr_type(group); + group = mld->mld_mca; + group_type = ipv6_addr_type(&group); if (group_type != IPV6_ADDR_ANY && !(group_type&IPV6_ADDR_MULTICAST)) @@ -1509,7 +1509,7 @@ static void __mld_query_work(struct sk_buff *skb) } } else { for_each_mc_mclock(idev, ma) { - if (!ipv6_addr_equal(group, &ma->mca_addr)) + if (!ipv6_addr_equal(&group, &ma->mca_addr)) continue; if (ma->mca_flags & MAF_TIMER_RUNNING) { /* gsquery <- gsquery && mark */ diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index d585ac3c1113..9d9c3763f2f5 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1713,12 +1713,10 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) static void __ip6t_unregister_table(struct net *net, struct xt_table *table) { - struct xt_table_info *private; - void *loc_cpu_entry; + struct xt_table_info *private = table->private; struct module *table_owner = table->me; struct ip6t_entry *iter; - - private = xt_unregister_table(table); + void *loc_cpu_entry; /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; @@ -1727,19 +1725,18 @@ static void __ip6t_unregister_table(struct net *net, struct xt_table *table) if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); + kfree(table); } int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *template_ops) { - struct nf_hook_ops *ops; - unsigned int num_ops; - int ret, i; - struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; - void *loc_cpu_entry; + struct xt_table_info *newinfo; struct xt_table *new_table; + void *loc_cpu_entry; + int ret; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) @@ -1754,7 +1751,7 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, return ret; } - new_table = xt_register_table(net, table, &bootstrap, newinfo); + new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct ip6t_entry *iter; @@ -1764,48 +1761,12 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, return PTR_ERR(new_table); } - if (!template_ops) - return 0; - - num_ops = hweight32(table->valid_hooks); - if (num_ops == 0) { - ret = -EINVAL; - goto out_free; - } - - ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); - if (!ops) { - ret = -ENOMEM; - goto out_free; - } - - for (i = 0; i < num_ops; i++) - ops[i].priv = new_table; - - new_table->ops = ops; - - ret = nf_register_net_hooks(net, ops, num_ops); - if (ret != 0) - goto out_free; - return ret; - -out_free: - __ip6t_unregister_table(net, new_table); - return ret; -} - -void ip6t_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } void ip6t_unregister_table_exit(struct net *net, const char *name) { - struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); + struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_IPV6, name); if (table) __ip6t_unregister_table(net, table); @@ -1894,7 +1855,6 @@ static void __exit ip6_tables_fini(void) } EXPORT_SYMBOL(ip6t_register_table); -EXPORT_SYMBOL(ip6t_unregister_table_pre_exit); EXPORT_SYMBOL(ip6t_unregister_table_exit); EXPORT_SYMBOL(ip6t_do_table); diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c index e7a3fb9355ee..450dd53846a2 100644 --- a/net/ipv6/netfilter/ip6t_hbh.c +++ b/net/ipv6/netfilter/ip6t_hbh.c @@ -168,6 +168,10 @@ static int hbh_mt6_check(const struct xt_mtchk_param *par) pr_debug("unknown flags %X\n", optsinfo->invflags); return -EINVAL; } + if (optsinfo->optsnr > IP6T_OPTS_OPTSNR) { + pr_debug("too many supported opts specified\n"); + return -EINVAL; + } if (optsinfo->flags & IP6T_OPTS_NSTRICT) { pr_debug("Not strict - not implemented"); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index e8992693e14a..b074fc477676 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -60,7 +60,7 @@ static int __net_init ip6table_filter_net_init(struct net *net) static void __net_exit ip6table_filter_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "filter"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "filter"); } static void __net_exit ip6table_filter_net_exit(struct net *net) @@ -76,32 +76,32 @@ static struct pernet_operations ip6table_filter_net_ops = { static int __init ip6table_filter_init(void) { - int ret = xt_register_template(&packet_filter, - ip6table_filter_table_init); - - if (ret < 0) - return ret; + int ret; filter_ops = xt_hook_ops_alloc(&packet_filter, ip6t_do_table); - if (IS_ERR(filter_ops)) { - xt_unregister_template(&packet_filter); + if (IS_ERR(filter_ops)) return PTR_ERR(filter_ops); - } ret = register_pernet_subsys(&ip6table_filter_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_filter, ip6table_filter_table_init); if (ret < 0) { - xt_unregister_template(&packet_filter); - kfree(filter_ops); - return ret; + unregister_pernet_subsys(&ip6table_filter_net_ops); + goto err_free; } + return 0; +err_free: + kfree(filter_ops); return ret; } static void __exit ip6table_filter_fini(void) { - unregister_pernet_subsys(&ip6table_filter_net_ops); xt_unregister_template(&packet_filter); + unregister_pernet_subsys(&ip6table_filter_net_ops); kfree(filter_ops); } diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 8dd4cd0c47bd..e6ee036a9b2c 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -89,7 +89,7 @@ static int ip6table_mangle_table_init(struct net *net) static void __net_exit ip6table_mangle_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "mangle"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "mangle"); } static void __net_exit ip6table_mangle_net_exit(struct net *net) @@ -104,32 +104,33 @@ static struct pernet_operations ip6table_mangle_net_ops = { static int __init ip6table_mangle_init(void) { - int ret = xt_register_template(&packet_mangler, - ip6table_mangle_table_init); - - if (ret < 0) - return ret; + int ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook); - if (IS_ERR(mangle_ops)) { - xt_unregister_template(&packet_mangler); + if (IS_ERR(mangle_ops)) return PTR_ERR(mangle_ops); - } ret = register_pernet_subsys(&ip6table_mangle_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_mangler, + ip6table_mangle_table_init); if (ret < 0) { - xt_unregister_template(&packet_mangler); - kfree(mangle_ops); - return ret; + unregister_pernet_subsys(&ip6table_mangle_net_ops); + goto err_free; } + return 0; +err_free: + kfree(mangle_ops); return ret; } static void __exit ip6table_mangle_fini(void) { - unregister_pernet_subsys(&ip6table_mangle_net_ops); xt_unregister_template(&packet_mangler); + unregister_pernet_subsys(&ip6table_mangle_net_ops); kfree(mangle_ops); } diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 5be723232df8..c2394e2c94b5 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -121,8 +121,11 @@ static int ip6table_nat_table_init(struct net *net) } ret = ip6t_nat_register_lookups(net); - if (ret < 0) + if (ret < 0) { + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "nat"); + synchronize_rcu(); ip6t_unregister_table_exit(net, "nat"); + } kfree(repl); return ret; @@ -131,6 +134,7 @@ static int ip6table_nat_table_init(struct net *net) static void __net_exit ip6table_nat_net_pre_exit(struct net *net) { ip6t_nat_unregister_lookups(net); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "nat"); } static void __net_exit ip6table_nat_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index fc9f6754028f..3b161ee875bc 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -52,7 +52,7 @@ static int ip6table_raw_table_init(struct net *net) static void __net_exit ip6table_raw_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "raw"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "raw"); } static void __net_exit ip6table_raw_net_exit(struct net *net) @@ -75,31 +75,31 @@ static int __init ip6table_raw_init(void) pr_info("Enabling raw table before defrag\n"); } - ret = xt_register_template(table, ip6table_raw_table_init); - if (ret < 0) - return ret; - /* Register hooks */ rawtable_ops = xt_hook_ops_alloc(table, ip6t_do_table); - if (IS_ERR(rawtable_ops)) { - xt_unregister_template(table); + if (IS_ERR(rawtable_ops)) return PTR_ERR(rawtable_ops); - } ret = register_pernet_subsys(&ip6table_raw_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(table, ip6table_raw_table_init); if (ret < 0) { - kfree(rawtable_ops); - xt_unregister_template(table); - return ret; + unregister_pernet_subsys(&ip6table_raw_net_ops); + goto err_free; } + return 0; +err_free: + kfree(rawtable_ops); return ret; } static void __exit ip6table_raw_fini(void) { - unregister_pernet_subsys(&ip6table_raw_net_ops); xt_unregister_template(&packet_raw); + unregister_pernet_subsys(&ip6table_raw_net_ops); kfree(rawtable_ops); } diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 4df14a9bae78..4bd5d97b8ab6 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -49,7 +49,7 @@ static int ip6table_security_table_init(struct net *net) static void __net_exit ip6table_security_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "security"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "security"); } static void __net_exit ip6table_security_net_exit(struct net *net) @@ -64,32 +64,33 @@ static struct pernet_operations ip6table_security_net_ops = { static int __init ip6table_security_init(void) { - int ret = xt_register_template(&security_table, - ip6table_security_table_init); - - if (ret < 0) - return ret; + int ret; sectbl_ops = xt_hook_ops_alloc(&security_table, ip6t_do_table); - if (IS_ERR(sectbl_ops)) { - xt_unregister_template(&security_table); + if (IS_ERR(sectbl_ops)) return PTR_ERR(sectbl_ops); - } ret = register_pernet_subsys(&ip6table_security_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&security_table, + ip6table_security_table_init); if (ret < 0) { - kfree(sectbl_ops); - xt_unregister_template(&security_table); - return ret; + unregister_pernet_subsys(&ip6table_security_net_ops); + goto err_free; } + return 0; +err_free: + kfree(sectbl_ops); return ret; } static void __exit ip6table_security_fini(void) { - unregister_pernet_subsys(&ip6table_security_net_ops); xt_unregister_template(&security_table); + unregister_pernet_subsys(&ip6table_security_net_ops); kfree(sectbl_ops); } diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index ced8bd44828e..893f2aeb4711 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -100,6 +100,7 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, const struct in6_addr *daddr = NULL, *saddr = NULL; struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var; struct sk_buff *data_skb = NULL; + unsigned short fragoff = 0; int doff = 0; int thoff = 0, tproto; #if IS_ENABLED(CONFIG_NF_CONNTRACK) @@ -107,8 +108,8 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, struct nf_conn const *ct; #endif - tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); - if (tproto < 0) { + tproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL); + if (tproto < 0 || fragoff) { pr_debug("unable to find transport header in IPv6 packet, dropping\n"); return NULL; } diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 8b2dba88ee96..2dbe44715df3 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -160,21 +160,40 @@ static bool nft_fib6_info_nh_dev_match(const struct net_device *nh_dev, l3mdev_master_ifindex_rcu(nh_dev) == dev->ifindex; } +static int nft_fib6_nh_match_dev_cb(struct fib6_nh *nh, void *arg) +{ + const struct net_device *dev = arg; + + return nft_fib6_info_nh_dev_match(nh->fib_nh_dev, dev); +} + static bool nft_fib6_info_nh_uses_dev(struct fib6_info *rt, const struct net_device *dev) { const struct net_device *nh_dev; struct fib6_info *iter; + /* External nexthop: fib6_siblings slot aliases nh_list, walk via nh. */ + if (rt->nh) + return nexthop_for_each_fib6_nh(rt->nh, + nft_fib6_nh_match_dev_cb, + (void *)dev); + nh_dev = fib6_info_nh_dev(rt); if (nft_fib6_info_nh_dev_match(nh_dev, dev)) return true; - list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { + if (!READ_ONCE(rt->fib6_nsiblings)) + return false; + + list_for_each_entry_rcu(iter, &rt->fib6_siblings, fib6_siblings) { nh_dev = fib6_info_nh_dev(iter); if (nft_fib6_info_nh_dev_match(nh_dev, dev)) return true; + + if (!READ_ONCE(rt->fib6_nsiblings)) + return false; } return false; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 19eb6b702227..636f0120d7e3 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -481,6 +481,9 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; + if (!READ_ONCE(first->fib6_nsiblings)) + break; + nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); if (hash > nh_upper_bound) continue; @@ -1645,6 +1648,10 @@ static unsigned int fib6_mtu(const struct fib6_result *res) rcu_read_lock(); idev = __in6_dev_get(dev); + if (!idev) { + rcu_read_unlock(); + return 0; + } mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); } @@ -4995,6 +5002,7 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; + fib6_update_sernum(net, rt); rt6_multipath_rebalance(rt); break; } @@ -5897,6 +5905,8 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; } + if (!READ_ONCE(rt->fib6_nsiblings)) + break; } rcu_read_unlock(); @@ -6928,7 +6938,7 @@ int __init ip6_route_init(void) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) ret = bpf_iter_register(); if (ret) - goto out_register_late_subsys; + goto out_register_notifier; #endif for_each_possible_cpu(cpu) { @@ -6941,6 +6951,10 @@ int __init ip6_route_init(void) out: return ret; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +out_register_notifier: + unregister_netdevice_notifier(&ip6_route_dev_notifier); +#endif out_register_late_subsys: rtnl_unregister_all(PF_INET6); unregister_pernet_subsys(&ip6_route_net_late_ops); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2c3f7a739709..36d75fb50a70 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -288,8 +288,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, saddr = &fl6->saddr; err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); - if (err) + if (err) { + dst_release(dst); goto failure; + } } /* set the source address */ @@ -1617,12 +1619,13 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v6_cookie_check(sk, skb); + if (!nsk) + return 0; if (nsk != sk) { - if (nsk) { - reason = tcp_child_process(sk, nsk, skb); - if (reason) - goto reset; - } + reason = tcp_child_process(sk, nsk, skb); + sock_put(nsk); + if (reason) + goto reset; return 0; } } else @@ -1827,13 +1830,16 @@ lookup: rst_reason = sk_rst_convert_drop_reason(drop_reason); tcp_v6_send_reset(nsk, skb, rst_reason); + sock_put(nsk); goto discard_and_relse; } + sock_put(nsk); sock_put(sk); return 0; } } + isn = 0; process: if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ @@ -1863,6 +1869,7 @@ process: th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); + TCP_SKB_CB(skb)->tcp_tw_isn = isn; skb->dev = NULL; @@ -1951,7 +1958,6 @@ do_time_wait: sk = sk2; tcp_v6_restore_cb(skb); refcounted = false; - __this_cpu_write(tcp_tw_isn, isn); goto process; } diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c index ea2f805d3b01..9b586fcec485 100644 --- a/net/ipv6/xfrm6_protocol.c +++ b/net/ipv6/xfrm6_protocol.c @@ -88,8 +88,10 @@ int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, dst = ip6_route_input_lookup(dev_net(skb->dev), skb->dev, &fl6, skb, flags); - if (dst->error) + if (dst->error) { + dst_release(dst); goto drop; + } skb_dst_set(skb, dst); } diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 72dfccd4e3d5..c2dc3338670e 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1540,7 +1540,7 @@ static int iucv_sock_getsockopt(struct socket *sock, int level, int optname, struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); unsigned int val; - int len; + int len, rc; if (level != SOL_IUCV) return -ENOPROTOOPT; @@ -1553,26 +1553,34 @@ static int iucv_sock_getsockopt(struct socket *sock, int level, int optname, len = min_t(unsigned int, len, sizeof(int)); + rc = 0; + + lock_sock(sk); switch (optname) { case SO_IPRMDATA_MSG: val = (iucv->flags & IUCV_IPRMDATA) ? 1 : 0; break; case SO_MSGLIMIT: - lock_sock(sk); val = (iucv->path != NULL) ? iucv->path->msglim /* connected */ : iucv->msglimit; /* default */ - release_sock(sk); break; case SO_MSGSIZE: - if (sk->sk_state == IUCV_OPEN) - return -EBADFD; + if (sk->sk_state == IUCV_OPEN) { + rc = -EBADFD; + break; + } val = (iucv->hs_dev) ? iucv->hs_dev->mtu - sizeof(struct af_iucv_trans_hdr) - ETH_HLEN : 0x7fffffff; break; default: - return -ENOPROTOOPT; + rc = -ENOPROTOOPT; + break; } + release_sock(sk); + + if (rc) + return rc; if (put_user(len, optlen)) return -EFAULT; diff --git a/net/key/af_key.c b/net/key/af_key.c index a166a88d8788..9cffeef18cd9 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3564,7 +3564,7 @@ static int set_ipsecrequest(struct sk_buff *skb, #ifdef CONFIG_NET_KEY_MIGRATE static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap) { int i; @@ -3669,7 +3669,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* broadcast migrate message to sockets */ - pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net); + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, net); return 0; @@ -3680,7 +3680,7 @@ err: #else static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap) { return -ENOPROTOOPT; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 157fc23ce4e1..9419c8555d22 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -441,12 +441,13 @@ struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) { if (tunnel) { list_for_each_entry_rcu(session, &tunnel->session_list, list) { - if (!strcmp(session->ifname, ifname)) { - refcount_inc(&session->ref_count); - rcu_read_unlock_bh(); + if (strcmp(session->ifname, ifname)) + continue; + if (!refcount_inc_not_zero(&session->ref_count)) + continue; + rcu_read_unlock_bh(); - return session; - } + return session; } } } @@ -1360,7 +1361,7 @@ static void l2tp_session_unhash(struct l2tp_session *session) spin_lock_bh(&pn->l2tp_session_idr_lock); /* Remove from the per-tunnel list */ - list_del_init(&session->list); + list_del_rcu(&session->list); /* Remove from per-net IDR */ if (tunnel->version == L2TP_HDR_VER_3) { diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 99d6582f41de..e0b1915be1a6 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1045,64 +1045,76 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, { struct pppol2tp_ioc_stats stats; struct l2tp_session *session; + int err = 0; + + session = pppol2tp_sock_to_session(sock->sk); + /* Validate session presence and magic integrity ONLY for commands + * that belong to L2TP and require a valid session. + */ switch (cmd) { case PPPIOCGMRU: case PPPIOCGFLAGS: - session = sock->sk->sk_user_data; + case PPPIOCSMRU: + case PPPIOCSFLAGS: + case PPPIOCGL2TPSTATS: if (!session) return -ENOTCONN; - if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) + if (session->magic != L2TP_SESSION_MAGIC) { + l2tp_session_put(session); return -EBADF; + } + break; + default: + break; + } + switch (cmd) { + case PPPIOCGMRU: + case PPPIOCGFLAGS: /* Not defined for tunnels */ - if (!session->session_id && !session->peer_session_id) - return -ENOSYS; + if (!session->session_id && !session->peer_session_id) { + err = -ENOSYS; + break; + } - if (put_user(0, (int __user *)arg)) - return -EFAULT; + if (put_user(0, (int __user *)arg)) { + err = -EFAULT; + break; + } break; case PPPIOCSMRU: case PPPIOCSFLAGS: - session = sock->sk->sk_user_data; - if (!session) - return -ENOTCONN; - - if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) - return -EBADF; - /* Not defined for tunnels */ - if (!session->session_id && !session->peer_session_id) - return -ENOSYS; + if (!session->session_id && !session->peer_session_id) { + err = -ENOSYS; + break; + } - if (!access_ok((int __user *)arg, sizeof(int))) - return -EFAULT; + if (!access_ok((int __user *)arg, sizeof(int))) { + err = -EFAULT; + break; + } break; case PPPIOCGL2TPSTATS: - session = sock->sk->sk_user_data; - if (!session) - return -ENOTCONN; - - if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) - return -EBADF; - /* Session 0 represents the parent tunnel */ if (!session->session_id && !session->peer_session_id) { u32 session_id; - int err; if (copy_from_user(&stats, (void __user *)arg, - sizeof(stats))) - return -EFAULT; + sizeof(stats))) { + err = -EFAULT; + break; + } session_id = stats.session_id; err = pppol2tp_tunnel_copy_stats(&stats, session->tunnel); if (err < 0) - return err; + break; stats.session_id = session_id; } else { @@ -1112,15 +1124,21 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, stats.tunnel_id = session->tunnel->tunnel_id; stats.using_ipsec = l2tp_tunnel_uses_xfrm(session->tunnel); - if (copy_to_user((void __user *)arg, &stats, sizeof(stats))) - return -EFAULT; + if (copy_to_user((void __user *)arg, &stats, sizeof(stats))) { + err = -EFAULT; + break; + } break; default: - return -ENOIOCTLCMD; + err = -ENOIOCTLCMD; + break; } - return 0; + if (session) + l2tp_session_put(session); + + return err; } /***************************************************************************** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 7b77d57c9f96..f9ee9947a94d 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2344,8 +2344,9 @@ static int sta_apply_parameters(struct ieee80211_local *local, sta->sta.max_sp = params->max_sp; } - ieee80211_sta_set_max_amsdu_subframes(sta, params->ext_capab, - params->ext_capab_len); + if (params->ext_capab) + ieee80211_sta_set_max_amsdu_subframes(sta, params->ext_capab, + params->ext_capab_len); /* * cfg80211 validates this (1-2007) and allows setting the AID diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 160ae65a5c64..b98ddfa3003e 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -438,6 +438,15 @@ ieee80211_verify_sta_ht_mcs_support(struct ieee80211_sub_if_data *sdata, ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); /* + * Some Xfinity XB8 firmware advertises >1 spatial stream MCS indexes in + * their basic HT-MCS set. On cards with lower spatial streams, the check + * would fail, and we'd be stuck with no HT when it in fact work fine with + * its own supported rate. So check it only in strict mode. + */ + if (!ieee80211_hw_check(&sdata->local->hw, STRICT)) + return true; + + /* * P802.11REVme/D7.0 - 6.5.4.2.4 * ... * If the MLME of an HT STA receives an MLME-JOIN.request primitive @@ -8155,6 +8164,7 @@ ieee80211_parse_neg_ttlm(struct ieee80211_sub_if_data *sdata, "No active links for TID %d", tid); return -EINVAL; } + pos += map_size; } else { map = 0; } @@ -8173,7 +8183,6 @@ ieee80211_parse_neg_ttlm(struct ieee80211_sub_if_data *sdata, default: return -EINVAL; } - pos += map_size; } return 0; } @@ -9140,7 +9149,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss *bss = (void *)cbss->priv; struct sta_info *new_sta = NULL; struct ieee80211_link_data *link; - bool have_sta = false; + struct sta_info *have_sta = NULL; bool mlo; int err; u16 new_links; @@ -9159,11 +9168,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, mlo = false; } - if (assoc) { - rcu_read_lock(); + if (assoc) have_sta = sta_info_get(sdata, ap_mld_addr); - rcu_read_unlock(); - } if (mlo && !have_sta && WARN_ON(sdata->vif.valid_links || sdata->vif.active_links)) @@ -9327,6 +9333,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, out_release_chan: ieee80211_link_release_channel(link); out_err: + if (mlo && have_sta) + WARN_ON(__sta_info_destroy(have_sta)); ieee80211_vif_set_links(sdata, 0, 0); return err; } @@ -11224,6 +11232,9 @@ static void ieee80211_ml_epcs(struct ieee80211_sub_if_data *sdata, control = get_unaligned_le16(pos); link_id = control & IEEE80211_MLE_STA_EPCS_CONTROL_LINK_ID; + if (link_id >= IEEE80211_MLD_MAX_NUM_LINKS) + continue; + link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c index 2b3632c6008a..77894d997113 100644 --- a/net/mac80211/parse.c +++ b/net/mac80211/parse.c @@ -34,6 +34,22 @@ #include "led.h" #include "wep.h" +static const u8 empty_non_inheritance[] = { + WLAN_EID_EXTENSION, 1, WLAN_EID_EXT_NON_INHERITANCE, + /* + * cfg80211_is_element_inherited() hardcodes elements that + * cannot be inherited, so we just need an empty one to be + * calling it at all. + */ +}; + +struct ieee80211_elem_defrag { + const struct element *elem; + /* container start/len */ + const u8 *start; + size_t len; +}; + struct ieee80211_elems_parse { /* must be first for kfree to work */ struct ieee802_11_elems elems; @@ -41,11 +57,7 @@ struct ieee80211_elems_parse { /* The basic Multi-Link element in the original elements */ const struct element *ml_basic_elem; - /* The reconfiguration Multi-Link element in the original elements */ - const struct element *ml_reconf_elem; - - /* The EPCS Multi-Link element in the original elements */ - const struct element *ml_epcs_elem; + struct ieee80211_elem_defrag ml_reconf, ml_epcs; bool multi_link_inner; bool skip_vendor; @@ -162,10 +174,14 @@ ieee80211_parse_extension_element(u32 *crc, } break; case IEEE80211_ML_CONTROL_TYPE_RECONF: - elems_parse->ml_reconf_elem = elem; + elems_parse->ml_reconf.elem = elem; + elems_parse->ml_reconf.start = params->start; + elems_parse->ml_reconf.len = params->len; break; case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: - elems_parse->ml_epcs_elem = elem; + elems_parse->ml_epcs.elem = elem; + elems_parse->ml_epcs.start = params->start; + elems_parse->ml_epcs.len = params->len; break; default: break; @@ -916,7 +932,7 @@ ieee80211_prep_mle_link_parse(struct ieee80211_elems_parse *elems_parse, { struct ieee802_11_elems *elems = &elems_parse->elems; struct ieee80211_mle_per_sta_profile *prof; - const struct element *tmp; + const struct element *tmp, *ret; ssize_t ml_len; const u8 *end; @@ -986,50 +1002,40 @@ ieee80211_prep_mle_link_parse(struct ieee80211_elems_parse *elems_parse, sub->from_ap = params->from_ap; sub->link_id = -1; - return cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, - sub->start, sub->len); -} - -static void -ieee80211_mle_defrag_reconf(struct ieee80211_elems_parse *elems_parse) -{ - struct ieee802_11_elems *elems = &elems_parse->elems; - ssize_t ml_len; + ret = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + sub->start, sub->len); + if (ret) + return ret; - ml_len = cfg80211_defragment_element(elems_parse->ml_reconf_elem, - elems->ie_start, - elems->total_len, - elems_parse->scratch_pos, - elems_parse->scratch + - elems_parse->scratch_len - - elems_parse->scratch_pos, - WLAN_EID_FRAGMENT); - if (ml_len < 0) - return; - elems->ml_reconf = (void *)elems_parse->scratch_pos; - elems->ml_reconf_len = ml_len; - elems_parse->scratch_pos += ml_len; + /* + * Since we know we want and found a profile, apply an empty + * non-inheritance if the profile didn't have one, so that any + * element that shouldn't be inherited by spec isn't. + */ + return (const void *)empty_non_inheritance; } -static void -ieee80211_mle_defrag_epcs(struct ieee80211_elems_parse *elems_parse) +static const void * +ieee80211_mle_defrag(struct ieee80211_elems_parse *elems_parse, + struct ieee80211_elem_defrag *defrag, + size_t *out_len) { - struct ieee802_11_elems *elems = &elems_parse->elems; + const void *ret; ssize_t ml_len; - ml_len = cfg80211_defragment_element(elems_parse->ml_epcs_elem, - elems->ie_start, - elems->total_len, + ml_len = cfg80211_defragment_element(defrag->elem, + defrag->start, defrag->len, elems_parse->scratch_pos, elems_parse->scratch + elems_parse->scratch_len - elems_parse->scratch_pos, WLAN_EID_FRAGMENT); if (ml_len < 0) - return; - elems->ml_epcs = (void *)elems_parse->scratch_pos; - elems->ml_epcs_len = ml_len; + return NULL; + ret = elems_parse->scratch_pos; + *out_len = ml_len; elems_parse->scratch_pos += ml_len; + return ret; } struct ieee802_11_elems * @@ -1042,6 +1048,7 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) size_t scratch_len = 3 * params->len; bool multi_link_inner = false; + BUILD_BUG_ON(sizeof(empty_non_inheritance) != empty_non_inheritance[1] + 2); BUILD_BUG_ON(offsetof(typeof(*elems_parse), elems) != 0); /* cannot parse for both a specific link and non-transmitted BSS */ @@ -1089,6 +1096,17 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, sub.start, nontx_len); + /* + * If it's a non-transmitted BSS, we shouldn't pick + * any elements in the outer parsing that shouldn't + * be inherited. If the profile has a non-inheritance + * element this automatically happens, but if not then + * provide an empty one so that the hard-coded elements + * in cfg80211_is_element_inherited() are ignored, but + * it must be called. + */ + if (params->bss->transmitted_bss && !non_inherit) + non_inherit = (const void *)empty_non_inheritance; } else { /* must always parse to get elems_parse->ml_basic_elem */ non_inherit = ieee80211_prep_mle_link_parse(elems_parse, params, @@ -1109,9 +1127,12 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) _ieee802_11_parse_elems_full(&sub, elems_parse, NULL); } - ieee80211_mle_defrag_reconf(elems_parse); - - ieee80211_mle_defrag_epcs(elems_parse); + elems->ml_reconf = ieee80211_mle_defrag(elems_parse, + &elems_parse->ml_reconf, + &elems->ml_reconf_len); + elems->ml_epcs = ieee80211_mle_defrag(elems_parse, + &elems_parse->ml_epcs, + &elems->ml_epcs_len); if (elems->tim && !elems->parse_error) { const struct ieee80211_tim_ie *tim_ie = elems->tim; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3e5d1c47a5b0..3fb40449c6c5 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4971,7 +4971,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, struct sk_buff *skb = rx->skb; struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - static ieee80211_rx_result res; + ieee80211_rx_result res; int orig_len = skb->len; int hdrlen = ieee80211_hdrlen(hdr->frame_control); int snap_offs = hdrlen; @@ -4984,6 +4984,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, u8 sa[ETH_ALEN]; } addrs __aligned(2); struct ieee80211_sta_rx_stats *stats; + u32 encoded_rate; /* for parallel-rx, we need to have DUP_VALIDATED, otherwise we write * to a common data structure; drivers can implement that per queue @@ -5091,11 +5092,14 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, /* push the addresses in front */ memcpy(skb_push(skb, sizeof(addrs)), &addrs, sizeof(addrs)); + /* capture before mesh forward may memset or free skb->cb */ + encoded_rate = sta_stats_encode_rate(status); + res = ieee80211_rx_mesh_data(rx->sdata, rx->sta, rx->skb); switch (res) { case RX_QUEUED: stats->last_rx = jiffies; - stats->last_rate = sta_stats_encode_rate(status); + stats->last_rate = encoded_rate; return true; case RX_CONTINUE: break; @@ -5380,7 +5384,9 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, if (!link_sta) goto out; - ieee80211_rx_data_set_link(&rx, link_sta->link_id); + if (!ieee80211_rx_data_set_link(&rx, + link_sta->link_id)) + goto out; } if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) diff --git a/net/mac80211/tests/chan-mode.c b/net/mac80211/tests/chan-mode.c index adc069065e73..fa370831d617 100644 --- a/net/mac80211/tests/chan-mode.c +++ b/net/mac80211/tests/chan-mode.c @@ -65,6 +65,7 @@ static const struct determine_chan_mode_case { .ht_capa_mask = { .mcs.rx_mask[0] = 0xf7, }, + .strict = true, }, { .desc = "Masking out a RX rate in VHT capabilities", .conn_mode = IEEE80211_CONN_MODE_EHT, diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index b487d2330f25..ea7f63e1fc17 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2181,7 +2181,9 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, case IEEE80211_RADIOTAP_ANTENNA: /* this can appear multiple times, keep a bitmap */ - info->control.antennas |= BIT(*iterator.this_arg); + /* control.antennas is only a 2-bit bitmap */ + if (*iterator.this_arg < 2) + info->control.antennas |= BIT(*iterator.this_arg); break; case IEEE80211_RADIOTAP_DATA_RETRIES: diff --git a/net/mac80211/util.c b/net/mac80211/util.c index b093bc203c81..2529b01e2cd5 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3700,11 +3700,11 @@ void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy, struct ieee80211_local *local = container_of(work, struct ieee80211_local, radar_detected_work); struct cfg80211_chan_def chandef; - struct ieee80211_chanctx *ctx; + struct ieee80211_chanctx *ctx, *tmp; lockdep_assert_wiphy(local->hw.wiphy); - list_for_each_entry(ctx, &local->chanctx_list, list) { + list_for_each_entry_safe(ctx, tmp, &local->chanctx_list, list) { if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) continue; diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index e1033643fab0..e4b230ef6099 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -920,9 +920,9 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) static void mctp_test_route_input_null_eid(struct kunit *test) { struct mctp_hdr hdr = RX_HDR(1, 10, 0, FL_S | FL_E | FL_TO); + struct sockaddr_mctp addr = { 0 }; struct sk_buff *skb_pkt, *skb_sk; struct mctp_test_dev *dev; - struct sockaddr_mctp addr; struct socket *sock; u8 type = 0; int rc; diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c index c3987d5ade7a..6eef8d485c25 100644 --- a/net/mctp/test/utils.c +++ b/net/mctp/test/utils.c @@ -116,7 +116,7 @@ void mctp_test_destroy_dev(struct mctp_test_dev *dev) static int mctp_test_dst_output(struct mctp_dst *dst, struct sk_buff *skb) { skb->dev = dst->dev->dev; - dev_queue_xmit(skb); + dev_direct_xmit(skb, 0); return 0; } diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c index 8a16672b94e2..4cc16cbeb328 100644 --- a/net/mptcp/bpf.c +++ b/net/mptcp/bpf.c @@ -14,7 +14,7 @@ struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { - if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk_is_mptcp(sk)) + if (sk && sk_fullsock(sk) && sk_is_tcp(sk) && sk_is_mptcp(sk)) return mptcp_sk(mptcp_subflow_ctx(sk)->conn); return NULL; diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c index 82ec15bcfd7f..082c46c0f50e 100644 --- a/net/mptcp/fastopen.c +++ b/net/mptcp/fastopen.c @@ -12,6 +12,7 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf struct sock *sk, *ssk; struct sk_buff *skb; struct tcp_sock *tp; + bool has_rxtstamp; /* on early fallback the subflow context is deleted by * subflow_syn_recv_sock() @@ -40,12 +41,13 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf */ tp->copied_seq += skb->len; subflow->ssn_offset += skb->len; + has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; /* Only the sequence delta is relevant */ MPTCP_SKB_CB(skb)->map_seq = -skb->len; MPTCP_SKB_CB(skb)->end_seq = 0; MPTCP_SKB_CB(skb)->offset = 0; - MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; + MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; MPTCP_SKB_CB(skb)->cant_coalesce = 1; mptcp_data_lock(sk); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 8a1c5698983c..b3ea7854818f 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -566,12 +566,17 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); + struct tcp_sock *tp = tcp_sk(sk); unsigned int dss_size = 0; struct mptcp_ext *mpext; unsigned int ack_size; bool ret = false; - u64 ack_seq; + /* Zero `use_ack` and `use_map` flags with one shot. */ + BUILD_BUG_ON(sizeof_field(struct mptcp_ext, flags) != sizeof(u16)); + BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct mptcp_ext, flags), + sizeof(u16))); + *(u16 *)&opts->ext_copy.flags = 0; opts->csum_reqd = READ_ONCE(msk->csum_enabled); mpext = skb ? mptcp_get_ext(skb) : NULL; @@ -595,20 +600,16 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, /* passive sockets msk will set the 'can_ack' after accept(), even * if the first subflow may have the already the remote key handy */ - opts->ext_copy.use_ack = 0; if (!READ_ONCE(msk->can_ack)) { *size = ALIGN(dss_size, 4); return ret; } - ack_seq = READ_ONCE(msk->ack_seq); if (READ_ONCE(msk->use_64bit_ack)) { ack_size = TCPOLEN_MPTCP_DSS_ACK64; - opts->ext_copy.data_ack = ack_seq; opts->ext_copy.ack64 = 1; } else { ack_size = TCPOLEN_MPTCP_DSS_ACK32; - opts->ext_copy.data_ack32 = (uint32_t)ack_seq; opts->ext_copy.ack64 = 0; } opts->ext_copy.use_ack = 1; @@ -618,6 +619,12 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, if (dss_size == 0) ack_size += TCPOLEN_MPTCP_DSS_BASE; + /* The caller is __tcp_transmit_skb(), and will compute the new rcv + * wnd soon: ensure that the window can shrink. + */ + if (skb) + tp->rcv_wnd = tp->rcv_nxt - tp->rcv_wup; + dss_size += ack_size; *size = ALIGN(dss_size, 4); @@ -658,7 +665,6 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); - bool drop_other_suboptions = false; unsigned int opt_size = *size; struct mptcp_addr_info addr; bool echo; @@ -669,36 +675,20 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * */ if (!mptcp_pm_should_add_signal(msk) || (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || - !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &addr, - &echo, &drop_other_suboptions)) + !skb || !skb_is_tcp_pure_ack(skb) || + !mptcp_pm_add_addr_signal(msk, opt_size, remaining, &addr, &echo)) return false; - /* - * Later on, mptcp_write_options() will enforce mutually exclusion with - * DSS, bail out if such option is set and we can't drop it. - */ - if (drop_other_suboptions) - remaining += opt_size; - else if (opts->suboptions & OPTION_MPTCP_DSS) - return false; + remaining += opt_size; len = mptcp_add_addr_len(addr.family, echo, !!addr.port); if (remaining < len) return false; *size = len; - if (drop_other_suboptions) { - pr_debug("drop other suboptions\n"); - opts->suboptions = 0; - - /* note that e.g. DSS could have written into the memory - * aliased by ahmac, we must reset the field here - * to avoid appending the hmac even for ADD_ADDR echo - * options - */ - opts->ahmac = 0; - *size -= opt_size; - } + pr_debug("drop other suboptions\n"); + opts->suboptions = 0; + *size -= opt_size; opts->addr = addr; opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { @@ -708,6 +698,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * &opts->addr); } else { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX); + opts->ahmac = 0; } pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d\n", opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port)); @@ -1297,19 +1288,14 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) return true; } -static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) +static u64 mptcp_set_rwin(struct mptcp_sock *msk, struct tcp_sock *tp, + struct tcphdr *th, u64 ack_seq) { const struct sock *ssk = (const struct sock *)tp; - struct mptcp_subflow_context *subflow; - u64 ack_seq, rcv_wnd_old, rcv_wnd_new; - struct mptcp_sock *msk; + u64 rcv_wnd_old, rcv_wnd_new; u32 new_win; u64 win; - subflow = mptcp_subflow_ctx(ssk); - msk = mptcp_sk(subflow->conn); - - ack_seq = READ_ONCE(msk->ack_seq); rcv_wnd_new = ack_seq + tp->rcv_wnd; rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent); @@ -1362,7 +1348,7 @@ raise_win: update_wspace: WRITE_ONCE(msk->old_wspace, tp->rcv_wnd); - subflow->rcv_wnd_sent = rcv_wnd_new; + return rcv_wnd_new; } static void mptcp_track_rwin(struct tcp_sock *tp) @@ -1474,13 +1460,25 @@ void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); if (mpext->use_ack) { + struct mptcp_sock *msk; + u64 ack_seq; + + /* DSS option is set only by mptcp_established_options, + * the caller is __tcp_transmit_skb() and ssk is always + * not NULL. + */ + subflow = mptcp_subflow_ctx(ssk); + msk = mptcp_sk(subflow->conn); + ack_seq = READ_ONCE(msk->ack_seq); if (mpext->ack64) { - put_unaligned_be64(mpext->data_ack, ptr); + put_unaligned_be64(ack_seq, ptr); ptr += 2; } else { - put_unaligned_be32(mpext->data_ack32, ptr); + put_unaligned_be32(ack_seq, ptr); ptr += 1; } + subflow->rcv_wnd_sent = mptcp_set_rwin(msk, tp, th, + ack_seq); } if (mpext->use_map) { @@ -1708,9 +1706,6 @@ mp_capable_done: i += 4; } } - - if (tp) - mptcp_set_rwin(tp, th); } __be32 mptcp_get_reset_option(const struct sk_buff *skb) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 57a456690406..470501470fe5 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -16,6 +16,7 @@ struct mptcp_pm_add_entry { struct list_head list; struct mptcp_addr_info addr; u8 retrans_times; + bool timer_done; struct timer_list add_timer; struct mptcp_sock *sock; struct rcu_head rcu; @@ -283,6 +284,9 @@ int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct mptcp_addr_info local, remote; + if (!__mptcp_subflow_active(subflow)) + continue; + mptcp_local_address((struct sock_common *)ssk, &local); if (!mptcp_addresses_equal(&local, addr, addr->port)) continue; @@ -305,18 +309,31 @@ static unsigned int mptcp_adjust_add_addr_timeout(struct mptcp_sock *msk) const struct net *net = sock_net((struct sock *)msk); unsigned int rto = mptcp_get_add_addr_timeout(net); struct mptcp_subflow_context *subflow; - unsigned int max = 0; + unsigned int max = 0, max_stale = 0; + + if (!rto) + return 0; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct inet_connection_sock *icsk = inet_csk(ssk); - if (icsk->icsk_rto > max) + if (!__mptcp_subflow_active(subflow)) + continue; + + if (unlikely(subflow->stale)) { + if (icsk->icsk_rto > max_stale) + max_stale = icsk->icsk_rto; + } else if (icsk->icsk_rto > max) { max = icsk->icsk_rto; + } } - if (max && max < rto) - rto = max; + if (max) + return min(max, rto); + + if (max_stale) + return min(max_stale, rto); return rto; } @@ -327,31 +344,33 @@ static void mptcp_pm_add_timer(struct timer_list *timer) add_timer); struct mptcp_sock *msk = entry->sock; struct sock *sk = (struct sock *)msk; - unsigned int timeout; + unsigned int timeout = 0; pr_debug("msk=%p\n", msk); - if (!msk) - return; - - if (inet_sk_state_load(sk) == TCP_CLOSE) - return; - - if (!entry->addr.id) - return; + bh_lock_sock(sk); + if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) + goto out; - if (mptcp_pm_should_add_signal_addr(msk)) { - sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + timeout = HZ / 20; goto out; } timeout = mptcp_adjust_add_addr_timeout(msk); - if (!timeout) + if (!timeout || mptcp_pm_should_add_signal_addr(msk)) goto out; spin_lock_bh(&msk->pm.lock); - if (!mptcp_pm_should_add_signal_addr(msk)) { + /* The cancel path (mptcp_pm_del_add_timer()) can race with this + * callback. Once cancel updates retrans_times to MAX, suppress further + * retransmissions here. If this callback acquires pm.lock first, one + * final transmit attempt is still possible. + */ + if (entry->retrans_times < ADD_ADDR_RETRANS_MAX && + !mptcp_pm_should_add_signal_addr(msk)) { pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id); mptcp_pm_announce_addr(msk, &entry->addr, false); mptcp_pm_add_addr_send_ack(msk); @@ -359,8 +378,9 @@ static void mptcp_pm_add_timer(struct timer_list *timer) } if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) - sk_reset_timer(sk, timer, - jiffies + (timeout << entry->retrans_times)); + timeout <<= entry->retrans_times; + else + timeout = 0; spin_unlock_bh(&msk->pm.lock); @@ -368,7 +388,13 @@ static void mptcp_pm_add_timer(struct timer_list *timer) mptcp_pm_subflow_established(msk); out: - __sock_put(sk); + if (timeout) + sk_reset_timer(sk, timer, jiffies + timeout); + else + /* if sock_put calls sk_free: avoid waiting for this timer */ + entry->timer_done = true; + bh_unlock_sock(sk); + sock_put(sk); } struct mptcp_pm_add_entry * @@ -394,8 +420,12 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, /* Note: entry might have been removed by another thread. * We hold rcu_read_lock() to ensure it is not freed under us. */ - if (stop_timer) - sk_stop_timer_sync(sk, &entry->add_timer); + if (stop_timer) { + if (check_id) + sk_stop_timer(sk, &entry->add_timer); + else + sk_stop_timer_sync(sk, &entry->add_timer); + } rcu_read_unlock(); return entry; @@ -431,6 +461,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); reset_timer: + add_entry->timer_done = false; timeout = mptcp_adjust_add_addr_timeout(msk); if (timeout) sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout); @@ -451,7 +482,8 @@ static void mptcp_pm_free_anno_list(struct mptcp_sock *msk) spin_unlock_bh(&msk->pm.lock); list_for_each_entry_safe(entry, tmp, &free_list, list) { - sk_stop_timer_sync(sk, &entry->add_timer); + if (!entry->timer_done) + sk_stop_timer_sync(sk, &entry->add_timer); kfree_rcu(entry, rcu); } } @@ -855,11 +887,11 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) } } -bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, - unsigned int opt_size, unsigned int remaining, - struct mptcp_addr_info *addr, bool *echo, - bool *drop_other_suboptions) +bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int opt_size, + unsigned int remaining, + struct mptcp_addr_info *addr, bool *echo) { + bool skip_add_addr = false; int ret = false; u8 add_addr; u8 family; @@ -875,30 +907,49 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, * plain dup-ack from TCP perspective. The other MPTCP-relevant info, * if any, will be carried by the 'original' TCP ack */ - if (skb && skb_is_tcp_pure_ack(skb)) { - remaining += opt_size; - *drop_other_suboptions = true; - } + remaining += opt_size; *echo = mptcp_pm_should_add_signal_echo(msk); - port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port); - - family = *echo ? msk->pm.remote.family : msk->pm.local.family; - if (remaining < mptcp_add_addr_len(family, *echo, port)) - goto out_unlock; - if (*echo) { *addr = msk->pm.remote; add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_ECHO); + port = !!msk->pm.remote.port; + family = msk->pm.remote.family; } else { *addr = msk->pm.local; add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_SIGNAL); + port = !!msk->pm.local.port; + family = msk->pm.local.family; } - WRITE_ONCE(msk->pm.addr_signal, add_addr); + + if (remaining < mptcp_add_addr_len(family, *echo, port)) { + struct net *net = sock_net((struct sock *)msk); + + if (*echo) { + MPTCP_INC_STATS(net, MPTCP_MIB_ECHOADDTXDROP); + } else { + skip_add_addr = true; + MPTCP_INC_STATS(net, MPTCP_MIB_ADDADDRTXDROP); + } + goto drop_signal_mark; + } + ret = true; +drop_signal_mark: + WRITE_ONCE(msk->pm.addr_signal, add_addr); + out_unlock: spin_unlock_bh(&msk->pm.lock); + + /* On pure-ACK option-space exhaustion, stop retrying this ADD_ADDR: + * clear the signal bit, cancel the matching retransmission timer, and + * let the PM state machine progress. + */ + if (skip_add_addr) { + mptcp_pm_del_add_timer(msk, addr, true); + mptcp_pm_subflow_established(msk); + } return ret; } diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index c9f1e5af3cd3..fc818b63752e 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -347,6 +347,8 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) /* check first for announce */ if (msk->pm.add_addr_signaled < endp_signal_max) { + u8 endp_id; + /* due to racing events on both ends we can reach here while * previous add address is still running: if we invoke now * mptcp_pm_announce_addr(), that will fail and the @@ -360,19 +362,20 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (!select_signal_address(pernet, msk, &local)) goto subflow; + /* Special case for ID0: set the correct ID */ + endp_id = local.addr.id; + if (endp_id == msk->mpc_endpoint_id) + local.addr.id = 0; + /* If the alloc fails, we are on memory pressure, not worth * continuing, and trying to create subflows. */ if (!mptcp_pm_alloc_anno_list(msk, &local.addr)) return; - __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); + __clear_bit(endp_id, msk->pm.id_avail_bitmap); msk->pm.add_addr_signaled++; - /* Special case for ID0: set the correct ID */ - if (local.addr.id == msk->mpc_endpoint_id) - local.addr.id = 0; - mptcp_pm_announce_addr(msk, &local.addr, false); mptcp_pm_addr_send_ack(msk); diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 8cbc1920afb4..0d3a95e676f1 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -408,19 +408,21 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) local.flags = entry.flags; local.ifindex = entry.ifindex; + spin_lock_bh(&msk->pm.lock); + msk->pm.extra_subflows++; + spin_unlock_bh(&msk->pm.lock); + lock_sock(sk); err = __mptcp_subflow_connect(sk, &local, &addr_r); release_sock(sk); - if (err) + if (err) { GENL_SET_ERR_MSG_FMT(info, "connect error: %d", err); - spin_lock_bh(&msk->pm.lock); - if (err) + spin_lock_bh(&msk->pm.lock); mptcp_userspace_pm_delete_local_addr(msk, &entry); - else - msk->pm.extra_subflows++; - spin_unlock_bh(&msk->pm.lock); + spin_unlock_bh(&msk->pm.lock); + } create_err: sock_put(sk); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 4546a8b09884..cb9515f505aa 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -397,12 +397,26 @@ static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb) return false; } - /* old data, keep it simple and drop the whole pkt, sender - * will retransmit as needed, if needed. + /* Completely old data? */ + if (!after64(MPTCP_SKB_CB(skb)->end_seq, msk->ack_seq)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); + mptcp_drop(sk, skb); + return false; + } + + /* Partial packet: map_seq < ack_seq < end_seq. + * Skip the already-acked bytes and enqueue the new data. */ - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); - mptcp_drop(sk, skb); - return false; + copy_len = MPTCP_SKB_CB(skb)->end_seq - msk->ack_seq; + MPTCP_SKB_CB(skb)->offset += msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; + MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq - + MPTCP_SKB_CB(skb)->map_seq; + msk->bytes_received += copy_len; + WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); + + skb_set_owner_r(skb, sk); + __skb_queue_tail(&sk->sk_receive_queue, skb); + return true; } static void mptcp_stop_rtx_timer(struct sock *sk) @@ -2262,6 +2276,10 @@ static bool mptcp_move_skbs(struct sock *sk) mptcp_backlog_spooled(sk, moved, &skbs); } mptcp_data_unlock(sk); + + if (enqueued && mptcp_epollin_ready(sk)) + sk->sk_data_ready(sk); + return enqueued; } @@ -2851,6 +2869,10 @@ static void __mptcp_retrans(struct sock *sk) msk->bytes_retrans += len; dfrag->already_sent = max(dfrag->already_sent, len); + /* With csum enabled retransmission can send new data. */ + if (after64(dfrag->already_sent + dfrag->data_seq, msk->snd_nxt)) + WRITE_ONCE(msk->snd_nxt, dfrag->already_sent + dfrag->data_seq); + reset_timer: mptcp_check_and_set_pending(sk); @@ -3473,6 +3495,7 @@ static int mptcp_disconnect(struct sock *sk, int flags) /* for fallback's sake */ WRITE_ONCE(msk->ack_seq, 0); + atomic64_set(&msk->rcv_wnd_sent, 0); WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); @@ -4405,6 +4428,8 @@ static int __mptcp_read_sock(struct sock *sk, read_descriptor_t *desc, } mptcp_eat_recv_skb(sk, skb); + if (!desc->count) + break; } if (noack) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index e4f5aba24da7..b93b878478d2 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1229,10 +1229,9 @@ static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list) return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1; } -bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, - unsigned int opt_size, unsigned int remaining, - struct mptcp_addr_info *addr, bool *echo, - bool *drop_other_suboptions); +bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int opt_size, + unsigned int remaining, + struct mptcp_addr_info *addr, bool *echo); bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 0efe40be2fde..fcf6feb2a9eb 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -67,6 +67,12 @@ static int mptcp_get_int_option(struct mptcp_sock *msk, sockptr_t optval, return 0; } +static void __mptcp_subflow_set_rcvbuf(struct sock *ssk, int val) +{ + WRITE_ONCE(ssk->sk_rcvbuf, val); + tcp_set_rcvbuf(ssk, val); +} + static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, int val) { struct mptcp_subflow_context *subflow; @@ -100,7 +106,7 @@ static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, in case SO_RCVBUF: case SO_RCVBUFFORCE: ssk->sk_userlocks |= SOCK_RCVBUF_LOCK; - WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); + __mptcp_subflow_set_rcvbuf(ssk, sk->sk_rcvbuf); break; case SO_MARK: if (READ_ONCE(ssk->sk_mark) != sk->sk_mark) { @@ -235,15 +241,19 @@ static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk, mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + int err; lock_sock(ssk); - sock_set_timestamping(ssk, optname, timestamping); + err = sock_set_timestamping(ssk, optname, timestamping); release_sock(ssk); + + if (err < 0 && ret == 0) + ret = err; } release_sock(sk); - return 0; + return ret; } static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock *msk, sockptr_t optval, @@ -807,11 +817,16 @@ static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level, mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + int err; - ret = tcp_setsockopt(ssk, level, optname, optval, optlen); - if (ret) - break; + err = tcp_setsockopt(ssk, level, optname, optval, optlen); + if (err < 0 && ret == 0) + ret = err; } + + if (!ret) + sockopt_seq_inc(msk); + return ret; } @@ -1556,7 +1571,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf; } if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) - WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); + __mptcp_subflow_set_rcvbuf(ssk, sk->sk_rcvbuf); } if (sock_flag(sk, SOCK_LINGER)) { diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index e2cb9d23e4a0..d562e149606f 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -581,7 +581,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->backup); if (!subflow_thmac_valid(subflow)) { - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKMAC); subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } @@ -908,7 +908,7 @@ create_child: if (!subflow_hmac_valid(subflow_req, &mp_opt)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); - subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); + subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child; } diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index c5a26236a0bb..3706b4a85a0f 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1613,6 +1613,7 @@ dump_last: ((dump_type == DUMP_ALL) == !!(set->type->features & IPSET_DUMP_LAST))) { write_unlock_bh(&ip_set_ref_lock); + set = NULL; continue; } pr_debug("List set: %s\n", set->name); @@ -1648,13 +1649,13 @@ dump_last: if (cb->args[IPSET_CB_PROTO] > IPSET_PROTOCOL_MIN && nla_put_net16(skb, IPSET_ATTR_INDEX, htons(index))) goto nla_put_failure; + if (set->variant->uref) + set->variant->uref(set, cb, true); ret = set->variant->head(set, skb); if (ret < 0) goto release_refcount; if (dump_flags & IPSET_FLAG_LIST_HEADER) goto next_set; - if (set->variant->uref) - set->variant->uref(set, cb, true); fallthrough; default: ret = set->variant->list(set, skb, cb); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index b79e5dd2af03..04e4627ddfc1 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -386,8 +386,9 @@ static void mtype_ext_cleanup(struct ip_set *set, struct hbucket *n) { int i; + u8 pos = smp_load_acquire(&n->pos); - for (i = 0; i < n->pos; i++) + for (i = 0; i < pos; i++) if (test_bit(i, n->used)) ip_set_ext_destroy(set, ahash_data(n, i, set->dsize)); } @@ -490,7 +491,7 @@ mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r) #ifdef IP_SET_HASH_WITH_NETS u8 k; #endif - u8 htable_bits = t->htable_bits; + u8 pos, htable_bits = t->htable_bits; spin_lock_bh(&t->hregion[r].lock); for (i = ahash_bucket_start(r, htable_bits); @@ -498,7 +499,8 @@ mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r) n = __ipset_dereference(hbucket(t, i)); if (!n) continue; - for (j = 0, d = 0; j < n->pos; j++) { + pos = smp_load_acquire(&n->pos); + for (j = 0, d = 0; j < pos; j++) { if (!test_bit(j, n->used)) { d++; continue; @@ -534,7 +536,7 @@ mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r) /* Still try to delete expired elements. */ continue; tmp->size = n->size - AHASH_INIT_SIZE; - for (j = 0, d = 0; j < n->pos; j++) { + for (j = 0, d = 0; j < pos; j++) { if (!test_bit(j, n->used)) continue; data = ahash_data(n, j, dsize); @@ -623,7 +625,7 @@ mtype_resize(struct ip_set *set, bool retried) { struct htype *h = set->data; struct htable *t, *orig; - u8 htable_bits; + u8 pos, htable_bits; size_t hsize, dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; @@ -685,7 +687,8 @@ retry: n = __ipset_dereference(hbucket(orig, i)); if (!n) continue; - for (j = 0; j < n->pos; j++) { + pos = smp_load_acquire(&n->pos); + for (j = 0; j < pos; j++) { if (!test_bit(j, n->used)) continue; data = ahash_data(n, j, dsize); @@ -809,9 +812,10 @@ mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size) { struct htype *h = set->data; const struct htable *t; - u32 i, j, r; struct hbucket *n; struct mtype_elem *data; + u32 i, j, r; + u8 pos; t = rcu_dereference_bh(h->table); for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) { @@ -820,7 +824,8 @@ mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size) n = rcu_dereference_bh(hbucket(t, i)); if (!n) continue; - for (j = 0; j < n->pos; j++) { + pos = smp_load_acquire(&n->pos); + for (j = 0; j < pos; j++) { if (!test_bit(j, n->used)) continue; data = ahash_data(n, j, set->dsize); @@ -848,6 +853,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, bool flag_exist = flags & IPSET_FLAG_EXIST; bool deleted = false, forceadd = false, reuse = false; u32 r, key, multi = 0, elements, maxelem; + u8 npos = 0; rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); @@ -889,7 +895,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, ext_size(AHASH_INIT_SIZE, set->dsize); goto copy_elem; } - for (i = 0; i < n->pos; i++) { + npos = smp_load_acquire(&n->pos); + for (i = 0; i < npos; i++) { if (!test_bit(i, n->used)) { /* Reuse first deleted entry */ if (j == -1) { @@ -933,7 +940,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (elements >= maxelem) goto set_full; /* Create a new slot */ - if (n->pos >= n->size) { + if (npos >= n->size) { #ifdef IP_SET_HASH_WITH_MULTI if (h->bucketsize >= AHASH_MAX_TUNED) goto set_full; @@ -962,7 +969,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, } copy_elem: - j = n->pos++; + j = npos++; data = ahash_data(n, j, set->dsize); copy_data: t->hregion[r].elements++; @@ -985,6 +992,8 @@ overwrite_extensions: if (SET_WITH_TIMEOUT(set)) ip_set_timeout_set(ext_timeout(data, set), ext->timeout); smp_mb__before_atomic(); + /* Ensure all data writes are visible before updating position */ + smp_store_release(&n->pos, npos); set_bit(j, n->used); if (old != ERR_PTR(-ENOENT)) { rcu_assign_pointer(hbucket(t, key), n); @@ -1043,6 +1052,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, int i, j, k, r, ret = -IPSET_ERR_EXIST; u32 key, multi = 0; size_t dsize = set->dsize; + u8 pos; /* Userspace add and resize is excluded by the mutex. * Kernespace add does not trigger resize. @@ -1058,7 +1068,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, n = rcu_dereference_bh(hbucket(t, key)); if (!n) goto out; - for (i = 0, k = 0; i < n->pos; i++) { + pos = smp_load_acquire(&n->pos); + for (i = 0, k = 0; i < pos; i++) { if (!test_bit(i, n->used)) { k++; continue; @@ -1072,8 +1083,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, ret = 0; clear_bit(i, n->used); smp_mb__after_atomic(); - if (i + 1 == n->pos) - n->pos--; + if (i + 1 == pos) + smp_store_release(&n->pos, --pos); t->hregion[r].elements--; #ifdef IP_SET_HASH_WITH_NETS for (j = 0; j < IPSET_NET_COUNT; j++) @@ -1094,11 +1105,11 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, x->flags = flags; } } - for (; i < n->pos; i++) { + for (; i < pos; i++) { if (!test_bit(i, n->used)) k++; } - if (k == n->pos) { + if (k == pos) { t->hregion[r].ext_size -= ext_size(n->size, dsize); rcu_assign_pointer(hbucket(t, key), NULL); kfree_rcu(n, rcu); @@ -1109,7 +1120,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (!tmp) goto out; tmp->size = n->size - AHASH_INIT_SIZE; - for (j = 0, k = 0; j < n->pos; j++) { + for (j = 0, k = 0; j < pos; j++) { if (!test_bit(j, n->used)) continue; data = ahash_data(n, j, dsize); @@ -1170,6 +1181,7 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, int ret, i, j = 0; #endif u32 key, multi = 0; + u8 pos; pr_debug("test by nets\n"); for (; j < NLEN && h->nets[j].cidr[0] && !multi; j++) { @@ -1187,7 +1199,8 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, n = rcu_dereference_bh(hbucket(t, key)); if (!n) continue; - for (i = 0; i < n->pos; i++) { + pos = smp_load_acquire(&n->pos); + for (i = 0; i < pos; i++) { if (!test_bit(i, n->used)) continue; data = ahash_data(n, i, set->dsize); @@ -1221,6 +1234,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct mtype_elem *data; int i, ret = 0; u32 key, multi = 0; + u8 pos; rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); @@ -1243,7 +1257,8 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, ret = 0; goto out; } - for (i = 0; i < n->pos; i++) { + pos = smp_load_acquire(&n->pos); + for (i = 0; i < pos; i++) { if (!test_bit(i, n->used)) continue; data = ahash_data(n, i, set->dsize); @@ -1360,6 +1375,7 @@ mtype_list(const struct ip_set *set, /* We assume that one hash bucket fills into one page */ void *incomplete; int i, ret = 0; + u8 pos; atd = nla_nest_start(skb, IPSET_ATTR_ADT); if (!atd) @@ -1378,7 +1394,8 @@ mtype_list(const struct ip_set *set, cb->args[IPSET_CB_ARG0], t, n); if (!n) continue; - for (i = 0; i < n->pos; i++) { + pos = smp_load_acquire(&n->pos); + for (i = 0; i < pos; i++) { if (!test_bit(i, n->used)) continue; e = ahash_data(n, i, set->dsize); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index a22ec1a6f6ec..e26ca2a370e3 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -150,7 +150,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; ip <= ip_to; ip++, i++) { + for (; ip <= ip_to; i++) { e.ip = htonl(ip); if (i > IPSET_MAX_RANGE) { hash_ipmark4_data_next(&h->next, &e); @@ -162,6 +162,10 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; ret = 0; + + if (ip == ip_to) + break; + ip++; } return ret; } diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index e977b5a9c48d..41ca24a22a02 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -186,7 +186,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; ip <= ip_to; ip++) { + for (; ip <= ip_to;) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++, i++) { @@ -203,6 +203,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], ret = 0; } + if (ip == ip_to) + break; + ip++; } return ret; } diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 39a01934b153..b9ac2efaa15c 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -182,7 +182,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; ip <= ip_to; ip++) { + for (; ip <= ip_to;) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++, i++) { @@ -199,6 +199,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], ret = 0; } + if (ip == ip_to) + break; + ip++; } return ret; } diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 5c6de605a9fb..2d6652d43199 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -274,7 +274,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], p = port; ip2 = ip2_from; } - for (; ip <= ip_to; ip++) { + for (; ip <= ip_to;) { e.ip = htonl(ip); for (; p <= port_to; p++) { e.port = htons(p); @@ -298,6 +298,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], ip2 = ip2_from; } p = port; + if (ip == ip_to) + break; + ip++; } return ret; } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 2082bfb2d93c..9ea6b4fa78bf 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -267,27 +267,20 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) hash_key2 = hash_key; use2 = false; } + conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */, &head, &head2); - spin_lock(&cp->lock); - - if (!(cp->flags & IP_VS_CONN_F_HASHED)) { - cp->flags |= IP_VS_CONN_F_HASHED; - WRITE_ONCE(cp->hn0.hash_key, hash_key); - WRITE_ONCE(cp->hn1.hash_key, hash_key2); - refcount_inc(&cp->refcnt); - hlist_bl_add_head_rcu(&cp->hn0.node, head); - if (use2) - hlist_bl_add_head_rcu(&cp->hn1.node, head2); - ret = 1; - } else { - pr_err("%s(): request for already hashed, called from %pS\n", - __func__, __builtin_return_address(0)); - ret = 0; - } - spin_unlock(&cp->lock); + cp->flags |= IP_VS_CONN_F_HASHED; + WRITE_ONCE(cp->hn0.hash_key, hash_key); + WRITE_ONCE(cp->hn1.hash_key, hash_key2); + refcount_inc(&cp->refcnt); + hlist_bl_add_head_rcu(&cp->hn0.node, head); + if (use2) + hlist_bl_add_head_rcu(&cp->hn1.node, head2); + conn_tab_unlock(head, head2); + ret = 1; /* Schedule resizing if load increases */ if (atomic_read(&ipvs->conn_count) > t->u_thresh && @@ -321,7 +314,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */, &head, &head2); - spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { /* Decrease refcnt and unlink conn only if we are last user */ @@ -334,7 +326,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) } } - spin_unlock(&cp->lock); conn_tab_unlock(head, head2); rcu_read_unlock(); @@ -637,6 +628,7 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) struct ip_vs_conn_hnode *hn; u32 hash_key, hash_key_new; struct ip_vs_conn_param p; + bool by_me = false; int ntbl; int dir; @@ -664,8 +656,16 @@ retry: t = rcu_dereference(t->new_tbl); ntbl++; /* We are lost? */ - if (ntbl >= 2) + if (ntbl >= 2) { + spin_lock_bh(&cp->lock); + if (cp->flags & IP_VS_CONN_F_NO_CPORT && by_me) + cp->cport = 0; + /* hn1 will be rehashed on next packet */ + spin_unlock_bh(&cp->lock); + IP_VS_ERR_RL("%s(): Too many ht changes for dir %d\n", + __func__, dir); return; + } } /* Rehashing during resize? Use the recent table for adds */ @@ -683,10 +683,13 @@ retry: if (head > head2 && t == t2) swap(head, head2); + /* Protect the cp->flags modification */ + spin_lock_bh(&cp->lock); + /* Lock seqcount only for the old bucket, even if we are on new table * because it affects the del operation, not the adding. */ - spin_lock_bh(&t->lock[hash_key & t->lock_mask].l); + spin_lock(&t->lock[hash_key & t->lock_mask].l); preempt_disable_nested(); write_seqcount_begin(&t->seqc[hash_key & t->seqc_mask]); @@ -704,14 +707,23 @@ retry: hlist_bl_unlock(head); write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); preempt_enable_nested(); - spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); + spin_unlock(&t->lock[hash_key & t->lock_mask].l); + spin_unlock_bh(&cp->lock); hash_key = hash_key_new; goto retry; } - spin_lock(&cp->lock); - if ((cp->flags & IP_VS_CONN_F_NO_CPORT) && - (cp->flags & IP_VS_CONN_F_HASHED)) { + /* Fill cport once, even if multiple packets try to do it */ + if (cp->flags & IP_VS_CONN_F_NO_CPORT && (!cp->cport || by_me)) { + /* If we race with resizing make sure cport is set for dir 1 */ + if (!cp->cport) { + cp->cport = cport; + by_me = true; + } + if (!dir) { + atomic_dec(&ipvs->no_cport_conns[af_id]); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + } /* We do not recalc hash_key_r under lock, we assume the * parameters in cp do not change, i.e. cport is * the only possible change. @@ -726,21 +738,17 @@ retry: hlist_bl_del_rcu(&hn->node); hlist_bl_add_head_rcu(&hn->node, head_new); } - if (!dir) { - atomic_dec(&ipvs->no_cport_conns[af_id]); - cp->flags &= ~IP_VS_CONN_F_NO_CPORT; - cp->cport = cport; - } } - spin_unlock(&cp->lock); if (head != head2) hlist_bl_unlock(head2); hlist_bl_unlock(head); write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); preempt_enable_nested(); - spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); - if (dir--) + spin_unlock(&t->lock[hash_key & t->lock_mask].l); + + spin_unlock_bh(&cp->lock); + if (dir-- && by_me) goto next_dir; } @@ -1835,7 +1843,7 @@ static void ip_vs_conn_flush(struct netns_ipvs *ipvs) if (!rcu_dereference_protected(ipvs->conn_tab, 1)) return; - cancel_delayed_work_sync(&ipvs->conn_resize_work); + disable_delayed_work_sync(&ipvs->conn_resize_work); if (!atomic_read(&ipvs->conn_count)) goto unreg; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index f5b7a2047291..d40b404c1bf6 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -237,7 +237,7 @@ int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n, { if (!t) return 1 << min_bits; - n = roundup_pow_of_two(n); + n = n > 0 ? roundup_pow_of_two(n) : 1; if (lfactor < 0) { int factor = min(-lfactor, max_bits); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 6632daa87ded..16daba8cac83 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -261,12 +261,28 @@ static void est_reload_work_handler(struct work_struct *work) if (!kd) continue; /* New config ? Stop kthread tasks */ - if (genid != genid_done) - ip_vs_est_kthread_stop(kd); + if (genid != genid_done) { + if (!id) { + /* Only we can stop kt 0 but not under mutex */ + mutex_unlock(&ipvs->est_mutex); + ip_vs_est_kthread_stop(kd); + mutex_lock(&ipvs->est_mutex); + if (!READ_ONCE(ipvs->enable)) + goto unlock; + /* kd for kt 0 is never destroyed */ + } else { + ip_vs_est_kthread_stop(kd); + } + } if (!kd->task && !ip_vs_est_stopped(ipvs)) { + bool start; + /* Do not start kthreads above 0 in calc phase */ - if ((!id || !ipvs->est_calc_phase) && - ip_vs_est_kthread_start(ipvs, kd) < 0) + if (id) + start = !ipvs->est_calc_phase; + else + start = kd->needed; + if (start && ip_vs_est_kthread_start(ipvs, kd) < 0) repeat = true; } } @@ -311,18 +327,22 @@ ip_vs_use_count_dec(void) /* Service hashing: * Operation Locking order * --------------------------------------------------------------------------- - * add table service_mutex, svc_resize_sem(W) - * del table service_mutex - * move between tables svc_resize_sem(W), seqcount_t(W), bit lock - * add/del service service_mutex, bit lock + * add first table service_mutex + * attach new table service_mutex + * add/del service service_mutex, RCU, bit lock + * move between tables (rehash) svc_resize_sem(W), seqcount_t(W), bit lock + * replace old with attached svc_resize_sem(W), svc_replace_sem(W) * find service RCU, seqcount_t(R) * walk services(blocking) service_mutex, svc_resize_sem(R) * walk services(non-blocking) RCU, seqcount_t(R) + * walk services(non-blocking) svc_resize_sem(R), RCU, seqcount_t(R) + * walk services(non-blocking) svc_replace_sem(R), RCU, seqcount_t(R) + * del table service_mutex after stopped work * - * - new tables are linked/unlinked under service_mutex and svc_resize_sem - * - new table is linked on resizing and all operations can run in parallel - * in 2 tables until the new table is registered as current one - * - two contexts can modify buckets: config and table resize, both in + * - new table is attached on resizing under service_mutex and all operations + * can run in parallel in 2 tables until the new table is registered as current + * one + * - two contexts can modify buckets: config and table resize (work), both in * process context * - only table resizer can move entries, so we do not protect t->seqc[] * items with t->lock[] @@ -330,9 +350,13 @@ ip_vs_use_count_dec(void) * services are moved to new table * - move operations may disturb readers: find operation will not miss entries * but walkers may see same entry twice if they are forced to retry chains - * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold - * service_mutex to disallow new tables to be installed or to check + * or to walk the newly attached second table + * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to check * svc_table_changes and repeat the RCU read section if new table is installed + * - walkers may serialize with the whole resizing process (svc_resize_sem) + * to prevent seeing same service twice or just with the svc_table + * replace (svc_replace_sem) when we can see entries twice but we + * prefer to run concurrently with the rehashing. */ /* @@ -371,9 +395,16 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) /* increase its refcnt because it is referenced by the svc table */ atomic_inc(&svc->refcnt); + /* We know if new table is attached under service_mutex but rely on + * RCU to hold the old table to be freed in resizer + */ + rcu_read_lock(); + + /* This can be the old or the new table */ + t = rcu_dereference(ipvs->svc_table); + /* New entries go into recent table */ - t = rcu_dereference_protected(ipvs->svc_table, 1); - t = rcu_dereference_protected(t->new_tbl, 1); + t = rcu_dereference(t->new_tbl); if (svc->fwmark == 0) { /* @@ -394,6 +425,8 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) hlist_bl_add_head_rcu(&svc->s_list, head); hlist_bl_unlock(head); + rcu_read_unlock(); + return 1; } @@ -416,7 +449,13 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) return 0; } - t = rcu_dereference_protected(ipvs->svc_table, 1); + /* We know if new table is attached under service_mutex but rely on + * RCU to hold the old table to be freed in resizer + */ + rcu_read_lock(); + + /* This can be the old or the new table */ + t = rcu_dereference(ipvs->svc_table); hash_key = READ_ONCE(svc->hash_key); /* We need to lock the bucket in the right table */ if (ip_vs_rht_same_table(t, hash_key)) { @@ -427,13 +466,13 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) /* Moved to new table ? */ if (hash_key != hash_key2) { hlist_bl_unlock(head); - t = rcu_dereference_protected(t->new_tbl, 1); + t = rcu_dereference(t->new_tbl); head = t->buckets + (hash_key2 & t->mask); hlist_bl_lock(head); } } else { /* It is already moved to new table */ - t = rcu_dereference_protected(t->new_tbl, 1); + t = rcu_dereference(t->new_tbl); head = t->buckets + (hash_key & t->mask); hlist_bl_lock(head); } @@ -443,6 +482,8 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) svc->flags &= ~IP_VS_SVC_F_HASHED; atomic_dec(&svc->refcnt); hlist_bl_unlock(head); + + rcu_read_unlock(); return 1; } @@ -650,15 +691,14 @@ static void svc_resize_work_handler(struct work_struct *work) goto unlock_sem; more_work = false; clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags); - if (!READ_ONCE(ipvs->enable) || - test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + if (!READ_ONCE(ipvs->enable)) goto unlock_m; t = rcu_dereference_protected(ipvs->svc_table, 1); /* Do nothing if table is removed */ if (!t) goto unlock_m; - /* New table needs to be registered? BUG! */ - if (t != rcu_dereference_protected(t->new_tbl, 1)) + /* New table already attached? BUG! */ + if (t != rcu_access_pointer(t->new_tbl)) goto unlock_m; lfactor = sysctl_svc_lfactor(ipvs); @@ -675,6 +715,7 @@ static void svc_resize_work_handler(struct work_struct *work) /* Flip the table_id */ t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK; + /* Attach new table */ rcu_assign_pointer(t->new_tbl, t_new); /* Allow add/del to new_tbl while moving from old table */ mutex_unlock(&ipvs->service_mutex); @@ -682,8 +723,8 @@ static void svc_resize_work_handler(struct work_struct *work) ip_vs_rht_for_each_bucket(t, bucket, head) { same_bucket: if (++limit >= 16) { - if (!READ_ONCE(ipvs->enable) || - test_bit(IP_VS_WORK_SVC_NORESIZE, + /* Check if work is stopped */ + if (test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) goto unlock_sem; if (resched_score >= 100) { @@ -748,16 +789,12 @@ same_bucket: goto same_bucket; } - /* Tables can be switched only under service_mutex */ - while (!mutex_trylock(&ipvs->service_mutex)) { - cond_resched(); - if (!READ_ONCE(ipvs->enable) || - test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) - goto unlock_sem; - } - if (!READ_ONCE(ipvs->enable) || - test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) - goto unlock_m; + /* Serialize with readers that don't like svc_table changes */ + down_write(&ipvs->svc_replace_sem); + + /* Check if work is stopped to avoid synchronize_rcu() */ + if (test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + goto unlock_repl; rcu_assign_pointer(ipvs->svc_table, t_new); /* Inform readers that new table is installed */ @@ -765,8 +802,8 @@ same_bucket: atomic_inc(&ipvs->svc_table_changes); t_free = t; -unlock_m: - mutex_unlock(&ipvs->service_mutex); +unlock_repl: + up_write(&ipvs->svc_replace_sem); unlock_sem: up_write(&ipvs->svc_resize_sem); @@ -785,6 +822,11 @@ out: test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) return; queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1); + return; + +unlock_m: + mutex_unlock(&ipvs->service_mutex); + goto unlock_sem; } static inline void @@ -1102,6 +1144,24 @@ out: return dest; } +/* Put destination in trash */ +static void ip_vs_trash_put_dest(struct netns_ipvs *ipvs, + struct ip_vs_dest *dest, unsigned long istart, + bool cleanup) +{ + spin_lock_bh(&ipvs->dest_trash_lock); + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), + refcount_read(&dest->refcnt)); + if (list_empty(&ipvs->dest_trash) && !cleanup) + mod_timer(&ipvs->dest_trash_timer, + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); + /* dest lives in trash with reference */ + list_add(&dest->t_list, &ipvs->dest_trash); + dest->idle_start = istart; + spin_unlock_bh(&ipvs->dest_trash_lock); +} + static void ip_vs_dest_rcu_free(struct rcu_head *head) { struct ip_vs_dest *dest; @@ -1461,9 +1521,12 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) ntohs(dest->vport)); ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); + /* On error put back dest into the trash */ if (ret < 0) - return ret; - __ip_vs_update_dest(svc, dest, udest, 1); + ip_vs_trash_put_dest(svc->ipvs, dest, dest->idle_start, + false); + else + __ip_vs_update_dest(svc, dest, udest, 1); } else { /* * Allocate and initialize the dest structure @@ -1533,17 +1596,7 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, */ ip_vs_rs_unhash(dest); - spin_lock_bh(&ipvs->dest_trash_lock); - IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", - IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), - refcount_read(&dest->refcnt)); - if (list_empty(&ipvs->dest_trash) && !cleanup) - mod_timer(&ipvs->dest_trash_timer, - jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); - /* dest lives in trash with reference */ - list_add(&dest->t_list, &ipvs->dest_trash); - dest->idle_start = 0; - spin_unlock_bh(&ipvs->dest_trash_lock); + ip_vs_trash_put_dest(ipvs, dest, 0, cleanup); /* Queue up delayed work to expire all no destination connections. * No-op when CONFIG_SYSCTL is disabled. @@ -1664,6 +1717,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_pe *pe = NULL; int ret_hooks = -1; int ret = 0; + bool grow; /* increase the module use count */ if (!ip_vs_use_count_inc()) @@ -1705,16 +1759,25 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, } #endif - t = rcu_dereference_protected(ipvs->svc_table, 1); + /* The old table can be freed, protect it with RCU */ + rcu_read_lock(); + t = rcu_dereference(ipvs->svc_table); if (!t) { int lfactor = sysctl_svc_lfactor(ipvs); int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor); + rcu_read_unlock(); t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); if (!t_new) { ret = -ENOMEM; goto out_err; } + grow = false; + } else { + /* Even the currently attached new table may need to grow */ + t = rcu_dereference(t->new_tbl); + grow = ip_vs_get_num_services(ipvs) + 1 > t->u_thresh; + rcu_read_unlock(); } if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { @@ -1773,6 +1836,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, goto out_err; if (t_new) { + /* Add table for first time */ clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); rcu_assign_pointer(ipvs->svc_table, t_new); t_new = NULL; @@ -1804,19 +1868,23 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ip_vs_svc_hash(svc); /* Schedule resize work */ - if (t && ip_vs_get_num_services(ipvs) > t->u_thresh && - !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags)) + if (grow && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags)) queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1); *svc_p = svc; if (!READ_ONCE(ipvs->enable)) { + mutex_lock(&ipvs->est_mutex); + /* Now there is a service - full throttle */ WRITE_ONCE(ipvs->enable, 1); + ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); + /* Start estimation for first time */ - ip_vs_est_reload_start(ipvs); + ip_vs_est_reload_start(ipvs, true); + mutex_unlock(&ipvs->est_mutex); } return 0; @@ -1830,7 +1898,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, if (ret_hooks >= 0) ip_vs_unregister_hooks(ipvs, u->af); if (svc != NULL) { - ip_vs_unbind_scheduler(svc, sched); + ip_vs_unbind_scheduler(svc); ip_vs_service_free(svc); } ip_vs_scheduler_put(sched); @@ -1894,9 +1962,8 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) old_sched = rcu_dereference_protected(svc->scheduler, 1); if (sched != old_sched) { if (old_sched) { - ip_vs_unbind_scheduler(svc, old_sched); - RCU_INIT_POINTER(svc->scheduler, NULL); - /* Wait all svc->sched_data users */ + ip_vs_unbind_scheduler(svc); + /* Wait all svc->scheduler/sched_data users */ synchronize_rcu(); } /* Bind the new scheduler */ @@ -1904,6 +1971,10 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) ret = ip_vs_bind_scheduler(svc, sched); if (ret) { ip_vs_scheduler_put(sched); + /* Try to restore the old_sched */ + if (old_sched && + !ip_vs_bind_scheduler(svc, old_sched)) + old_sched = NULL; goto out; } } @@ -1959,7 +2030,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) /* Unbind scheduler */ old_sched = rcu_dereference_protected(svc->scheduler, 1); - ip_vs_unbind_scheduler(svc, old_sched); + ip_vs_unbind_scheduler(svc); ip_vs_scheduler_put(old_sched); /* Unbind persistence engine, keep svc->pe */ @@ -2022,7 +2093,6 @@ static int ip_vs_del_service(struct ip_vs_service *svc) return -EEXIST; ipvs = svc->ipvs; ip_vs_unlink_service(svc, false); - t = rcu_dereference_protected(ipvs->svc_table, 1); /* Drop the table if no more services */ ns = ip_vs_get_num_services(ipvs); @@ -2030,8 +2100,12 @@ static int ip_vs_del_service(struct ip_vs_service *svc) /* Stop the resizer and drop the tables */ set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); cancel_delayed_work_sync(&ipvs->svc_resize_work); + t = rcu_dereference_protected(ipvs->svc_table, 1); if (t) { rcu_assign_pointer(ipvs->svc_table, NULL); + /* Inform readers that table is removed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->svc_table_changes); while (1) { p = rcu_dereference_protected(t->new_tbl, 1); call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); @@ -2040,11 +2114,19 @@ static int ip_vs_del_service(struct ip_vs_service *svc) t = p; } } - } else if (ns <= t->l_thresh && - !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, - &ipvs->work_flags)) { - queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, - 1); + } else { + bool shrink; + + rcu_read_lock(); + t = rcu_dereference(ipvs->svc_table); + /* Even the currently attached new table may need to shrink */ + t = rcu_dereference(t->new_tbl); + shrink = ns <= t->l_thresh; + rcu_read_unlock(); + if (shrink && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, + &ipvs->work_flags)) + queue_delayed_work(system_unbound_wq, + &ipvs->svc_resize_work, 1); } return 0; } @@ -2078,6 +2160,9 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) t = rcu_dereference_protected(ipvs->svc_table, 1); if (t) { rcu_assign_pointer(ipvs->svc_table, NULL); + /* Inform readers that table is removed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->svc_table_changes); while (1) { p = rcu_dereference_protected(t->new_tbl, 1); call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); @@ -2086,6 +2171,11 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) t = p; } } + /* Stop the tot_stats estimator early under service_mutex + * to avoid locking it again later. + */ + if (cleanup) + ip_vs_stop_estimator_tot_stats(ipvs); return 0; } @@ -2141,17 +2231,21 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, struct ip_vs_service *svc; struct hlist_bl_node *e; struct ip_vs_dest *dest; - int old_gen, new_gen; + int old_gen; if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + /* Allow concurrent rehashing on resize but to avoid loop + * serialize with installing the new table. + */ + down_read(&ipvs->svc_replace_sem); + old_gen = atomic_read(&ipvs->svc_table_changes); rcu_read_lock(); -repeat: smp_rmb(); /* ipvs->svc_table and svc_table_changes */ ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { @@ -2164,17 +2258,17 @@ repeat: } resched_score++; if (resched_score >= 100) { - resched_score = 0; cond_resched_rcu(); - new_gen = atomic_read(&ipvs->svc_table_changes); - /* New table installed ? */ - if (old_gen != new_gen) { - old_gen = new_gen; - goto repeat; - } + /* Flushed? So no more dev refs */ + if (atomic_read(&ipvs->svc_table_changes) != old_gen) + goto done; + resched_score = 0; } } + +done: rcu_read_unlock(); + up_read(&ipvs->svc_replace_sem); return NOTIFY_DONE; } @@ -2201,6 +2295,10 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) struct ip_vs_service *svc; struct hlist_bl_node *e; + /* svc_table can not be replaced (svc_replace_sem) or + * removed (service_mutex) + */ + down_read(&ipvs->svc_replace_sem); rcu_read_lock(); ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { @@ -2216,6 +2314,7 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) } rcu_read_unlock(); + up_read(&ipvs->svc_replace_sem); ip_vs_zero_stats(&ipvs->tot_stats->s); return 0; @@ -2331,7 +2430,7 @@ static int ipvs_proc_est_cpumask_set(const struct ctl_table *table, /* est_max_threads may depend on cpulist size */ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); ipvs->est_calc_phase = 1; - ip_vs_est_reload_start(ipvs); + ip_vs_est_reload_start(ipvs, true); unlock: mutex_unlock(&ipvs->est_mutex); @@ -2351,11 +2450,14 @@ static int ipvs_proc_est_cpumask_get(const struct ctl_table *table, mutex_lock(&ipvs->est_mutex); - if (ipvs->est_cpulist_valid) - mask = *valp; - else - mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); - ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); + /* HK_TYPE_KTHREAD cpumask needs RCU protection */ + scoped_guard(rcu) { + if (ipvs->est_cpulist_valid) + mask = *valp; + else + mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); + ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); + } mutex_unlock(&ipvs->est_mutex); @@ -2411,7 +2513,7 @@ static int ipvs_proc_est_nice(const struct ctl_table *table, int write, mutex_lock(&ipvs->est_mutex); if (*valp != val) { *valp = val; - ip_vs_est_reload_start(ipvs); + ip_vs_est_reload_start(ipvs, true); } mutex_unlock(&ipvs->est_mutex); } @@ -2438,7 +2540,7 @@ static int ipvs_proc_run_estimation(const struct ctl_table *table, int write, mutex_lock(&ipvs->est_mutex); if (*valp != val) { *valp = val; - ip_vs_est_reload_start(ipvs); + ip_vs_est_reload_start(ipvs, true); } mutex_unlock(&ipvs->est_mutex); } @@ -2463,7 +2565,7 @@ static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write, if (val < -8 || val > 8) { ret = -EINVAL; } else { - *valp = val; + WRITE_ONCE(*valp, val); if (rcu_access_pointer(ipvs->conn_tab)) mod_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, 0); @@ -2490,10 +2592,16 @@ static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write, if (val < -8 || val > 8) { ret = -EINVAL; } else { - *valp = val; - if (rcu_access_pointer(ipvs->svc_table)) + mutex_lock(&ipvs->service_mutex); + WRITE_ONCE(*valp, val); + /* Make sure the services are present */ + if (rcu_access_pointer(ipvs->svc_table) && + READ_ONCE(ipvs->enable) && + !test_bit(IP_VS_WORK_SVC_NORESIZE, + &ipvs->work_flags)) mod_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 0); + mutex_unlock(&ipvs->service_mutex); } } return ret; @@ -3004,11 +3112,13 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) int old_gen, new_gen; u32 counts[8]; u32 bucket; - int count; + u32 count; + int loops; u32 sum1; u32 sum; int i; + /* Info for conns */ rcu_read_lock(); t = rcu_dereference(ipvs->conn_tab); @@ -3020,6 +3130,7 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) if (!atomic_read(&ipvs->conn_count)) goto after_conns; old_gen = atomic_read(&ipvs->conn_tab_changes); + loops = 0; repeat_conn: smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ @@ -3032,8 +3143,11 @@ repeat_conn: resched_score++; ip_vs_rht_walk_bucket_rcu(t, bucket, head) { count = 0; - hlist_bl_for_each_entry_rcu(hn, e, head, node) + hlist_bl_for_each_entry_rcu(hn, e, head, node) { count++; + if (count >= ARRAY_SIZE(counts) - 1) + break; + } } resched_score += count; if (resched_score >= 100) { @@ -3042,31 +3156,40 @@ repeat_conn: new_gen = atomic_read(&ipvs->conn_tab_changes); /* New table installed ? */ if (old_gen != new_gen) { + /* Too many changes? */ + if (++loops >= 5) + goto after_conns; old_gen = new_gen; goto repeat_conn; } } - counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++; + counts[count]++; } } for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) sum += counts[i]; sum1 = sum - counts[0]; - seq_printf(seq, "Conn buckets empty:\t%u (%lu%%)\n", - counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U)); + seq_printf(seq, "Conn buckets empty:\t%u (%llu%%)\n", + counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U))); for (i = 1; i < ARRAY_SIZE(counts); i++) { if (!counts[i]) continue; - seq_printf(seq, "Conn buckets len-%d:\t%u (%lu%%)\n", + seq_printf(seq, "Conn buckets len-%d:\t%u (%llu%%)\n", i, counts[i], - (unsigned long)counts[i] * 100 / max(sum1, 1U)); + div_u64((u64)counts[i] * 100U, max(sum1, 1U))); } after_conns: + rcu_read_unlock(); + + /* Info for services */ + down_read(&ipvs->svc_replace_sem); + rcu_read_lock(); + t = rcu_dereference(ipvs->svc_table); count = ip_vs_get_num_services(ipvs); - seq_printf(seq, "Services:\t%d\n", count); + seq_printf(seq, "Services:\t%u\n", count); seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n", t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0); @@ -3074,7 +3197,6 @@ after_conns: goto after_svc; old_gen = atomic_read(&ipvs->svc_table_changes); -repeat_svc: smp_rmb(); /* ipvs->svc_table and svc_table_changes */ memset(counts, 0, sizeof(counts)); ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) { @@ -3086,37 +3208,41 @@ repeat_svc: ip_vs_rht_walk_bucket_rcu(t, bucket, head) { count = 0; hlist_bl_for_each_entry_rcu(svc, e, head, - s_list) + s_list) { count++; + if (count >= ARRAY_SIZE(counts) - 1) + break; + } } resched_score += count; if (resched_score >= 100) { resched_score = 0; cond_resched_rcu(); - new_gen = atomic_read(&ipvs->svc_table_changes); - /* New table installed ? */ - if (old_gen != new_gen) { - old_gen = new_gen; - goto repeat_svc; - } + /* Flushed? */ + if (atomic_read(&ipvs->svc_table_changes) != + old_gen) + goto after_svc; } - counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++; + counts[count]++; } } for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) sum += counts[i]; sum1 = sum - counts[0]; - seq_printf(seq, "Service buckets empty:\t%u (%lu%%)\n", - counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U)); + seq_printf(seq, "Service buckets empty:\t%u (%llu%%)\n", + counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U))); for (i = 1; i < ARRAY_SIZE(counts); i++) { if (!counts[i]) continue; - seq_printf(seq, "Service buckets len-%d:\t%u (%lu%%)\n", + seq_printf(seq, "Service buckets len-%d:\t%u (%llu%%)\n", i, counts[i], - (unsigned long)counts[i] * 100 / max(sum1, 1U)); + div_u64((u64)counts[i] * 100U, max(sum1, 1U))); } after_svc: + rcu_read_unlock(); + up_read(&ipvs->svc_replace_sem); + seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n", ipvs->est_kt_count, ipvs->est_max_threads); seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max); @@ -3124,7 +3250,6 @@ after_svc: ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR * IPVS_EST_NTICKS); - rcu_read_unlock(); return 0; } @@ -3436,7 +3561,7 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs, int ret = 0; lockdep_assert_held(&ipvs->svc_resize_sem); - /* All service modifications are disabled, go ahead */ + /* All svc_table modifications are disabled, go ahead */ ip_vs_rht_walk_buckets(ipvs->svc_table, head) { hlist_bl_for_each_entry(svc, e, head, s_list) { /* Only expose IPv4 entries to old interface */ @@ -3620,7 +3745,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) pr_err("length: %u != %zu\n", *len, size); return -EINVAL; } - /* Protect against table resizer moving the entries. + /* Prevent modifications to the list with services. * Try reverse locking, so that we do not hold the mutex * while waiting for semaphore. */ @@ -3962,6 +4087,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, int start = cb->args[0]; int idx = 0; + /* Make sure we do not see same service twice during resize */ down_read(&ipvs->svc_resize_sem); rcu_read_lock(); ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) { @@ -4967,7 +5093,14 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); - ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + if (ipvs->tot_stats->s.est.ktid != -2) { + /* Not stopped yet? This happens only on netns init error and + * we even do not need to lock the service_mutex for this case. + */ + mutex_lock(&ipvs->service_mutex); + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + mutex_unlock(&ipvs->service_mutex); + } if (ipvs->est_cpulist_valid) free_cpumask_var(ipvs->sysctl_est_cpulist); @@ -4998,6 +5131,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) /* Initialize service_mutex, svc_table per netns */ __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key); init_rwsem(&ipvs->svc_resize_sem); + init_rwsem(&ipvs->svc_replace_sem); INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler); atomic_set(&ipvs->svc_table_changes, 0); RCU_INIT_POINTER(ipvs->svc_table, NULL); @@ -5039,7 +5173,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) ipvs->net->proc_net, ip_vs_stats_percpu_show, NULL)) goto err_percpu; - if (!proc_create_net_single("ip_vs_status", 0, ipvs->net->proc_net, + if (!proc_create_net_single("ip_vs_status", 0440, ipvs->net->proc_net, ip_vs_status_show, NULL)) goto err_status; #endif diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 433ba3cab58c..ab09f5182951 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -68,6 +68,11 @@ and the limit of estimators per kthread - est_add_ktid: ktid where to add new ests, can point to empty slot where we should add kt data + - data protected by service_mutex: est_temp_list, est_add_ktid, + est_kt_count(R/W), est_kt_arr(R/W), est_genid_done, kd->needed(R/W) + - data protected by est_mutex: est_genid, est_max_threads, sysctl_est_cpulist, + est_cpulist_valid, sysctl_est_nice, est_stopped, sysctl_run_estimation, + est_kt_count(R), est_kt_arr(R), kd->needed(R), kd->task (id > 0) */ static struct lock_class_key __ipvs_est_key; @@ -227,14 +232,17 @@ static int ip_vs_estimation_kthread(void *data) } /* Schedule stop/start for kthread tasks */ -void ip_vs_est_reload_start(struct netns_ipvs *ipvs) +void ip_vs_est_reload_start(struct netns_ipvs *ipvs, bool restart) { + lockdep_assert_held(&ipvs->est_mutex); + /* Ignore reloads before first service is added */ if (!READ_ONCE(ipvs->enable)) return; ip_vs_est_stopped_recalc(ipvs); - /* Bump the kthread configuration genid */ - atomic_inc(&ipvs->est_genid); + /* Bump the kthread configuration genid if stopping is requested */ + if (restart) + atomic_inc(&ipvs->est_genid); queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0); } @@ -304,12 +312,17 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) void *arr = NULL; int i; - if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && - READ_ONCE(ipvs->enable) && ipvs->est_max_threads) - return -EINVAL; - mutex_lock(&ipvs->est_mutex); + /* Allow kt 0 data to be created before the services are added + * and limit the kthreads when services are present. + */ + if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && + READ_ONCE(ipvs->enable) && ipvs->est_max_threads) { + ret = -EINVAL; + goto out; + } + for (i = 0; i < id; i++) { if (!ipvs->est_kt_arr[i]) break; @@ -333,6 +346,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) kd->est_timer = jiffies; kd->id = id; ip_vs_est_set_params(ipvs, kd); + kd->needed = 1; /* Pre-allocate stats used in calc phase */ if (!id && !kd->calc_stats) { @@ -341,12 +355,8 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) goto out; } - /* Start kthread tasks only when services are present */ - if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) { - ret = ip_vs_est_kthread_start(ipvs, kd); - if (ret < 0) - goto out; - } + /* Request kthread to be started */ + ip_vs_est_reload_start(ipvs, false); if (arr) ipvs->est_kt_count++; @@ -482,12 +492,11 @@ out: /* Start estimation for stats */ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { + struct ip_vs_est_kt_data *kd = ipvs->est_kt_count > 0 ? + ipvs->est_kt_arr[0] : NULL; struct ip_vs_estimator *est = &stats->est; int ret; - if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable)) - ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); - est->ktid = -1; est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */ @@ -496,8 +505,15 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) * will not allocate much memory, just for kt 0. */ ret = 0; - if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0]) + if (!kd) { ret = ip_vs_est_add_kthread(ipvs); + } else if (!kd->needed) { + mutex_lock(&ipvs->est_mutex); + /* We have job for the kt 0 task */ + kd->needed = 1; + ip_vs_est_reload_start(ipvs, true); + mutex_unlock(&ipvs->est_mutex); + } if (ret >= 0) hlist_add_head(&est->list, &ipvs->est_temp_list); else @@ -578,16 +594,14 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) } end_kt0: - /* kt 0 is freed after all other kthreads and chains are empty */ + /* kt 0 task is stopped after all other kt slots and chains are empty */ if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) { kd = ipvs->est_kt_arr[0]; - if (!kd || !kd->est_count) { + if (kd && !kd->est_count) { mutex_lock(&ipvs->est_mutex); - if (kd) { - ip_vs_est_kthread_destroy(kd); - ipvs->est_kt_arr[0] = NULL; - } - ipvs->est_kt_count--; + /* Keep the kt0 data but request kthread_stop */ + kd->needed = 0; + ip_vs_est_reload_start(ipvs, true); mutex_unlock(&ipvs->est_mutex); ipvs->est_add_ktid = 0; } @@ -647,9 +661,9 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) u64 val; INIT_HLIST_HEAD(&chain); - mutex_lock(&ipvs->service_mutex); + mutex_lock(&ipvs->est_mutex); kd = ipvs->est_kt_arr[0]; - mutex_unlock(&ipvs->service_mutex); + mutex_unlock(&ipvs->est_mutex); s = kd ? kd->calc_stats : NULL; if (!s) goto out; @@ -748,16 +762,16 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) if (!ip_vs_est_calc_limits(ipvs, &chain_max)) return; - mutex_lock(&ipvs->service_mutex); - /* Stop all other tasks, so that we can immediately move the * estimators to est_temp_list without RCU grace period */ mutex_lock(&ipvs->est_mutex); for (id = 1; id < ipvs->est_kt_count; id++) { /* netns clean up started, abort */ - if (!READ_ONCE(ipvs->enable)) - goto unlock2; + if (kthread_should_stop() || !READ_ONCE(ipvs->enable)) { + mutex_unlock(&ipvs->est_mutex); + return; + } kd = ipvs->est_kt_arr[id]; if (!kd) continue; @@ -765,9 +779,11 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) } mutex_unlock(&ipvs->est_mutex); + mutex_lock(&ipvs->service_mutex); + /* Move all estimators to est_temp_list but carefully, * all estimators and kthread data can be released while - * we reschedule. Even for kthread 0. + * we reschedule. */ step = 0; @@ -849,9 +865,7 @@ walk_chain: ip_vs_stop_estimator(ipvs, stats); /* Tasks are stopped, move without RCU grace period */ est->ktid = -1; - est->ktrow = row - kd->est_row; - if (est->ktrow < 0) - est->ktrow += IPVS_EST_NTICKS; + est->ktrow = delay; hlist_add_head(&est->list, &ipvs->est_temp_list); /* kd freed ? */ if (last) @@ -889,7 +903,6 @@ end_dequeue: if (genid == atomic_read(&ipvs->est_genid)) ipvs->est_calc_phase = 0; -unlock2: mutex_unlock(&ipvs->est_mutex); unlock: diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index c6e421c4e299..24adc38942a0 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -56,19 +56,19 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, /* * Unbind a service with its scheduler */ -void ip_vs_unbind_scheduler(struct ip_vs_service *svc, - struct ip_vs_scheduler *sched) +void ip_vs_unbind_scheduler(struct ip_vs_service *svc) { - struct ip_vs_scheduler *cur_sched; + struct ip_vs_scheduler *sched; - cur_sched = rcu_dereference_protected(svc->scheduler, 1); - /* This check proves that old 'sched' was installed */ - if (!cur_sched) + sched = rcu_dereference_protected(svc->scheduler, 1); + if (!sched) return; + /* Reset the scheduler before initiating any RCU callbacks */ + rcu_assign_pointer(svc->scheduler, NULL); + smp_wmb(); /* paired with smp_rmb() in ip_vs_schedule() */ if (sched->done_service) sched->done_service(svc); - /* svc->scheduler can be set to NULL only by caller */ } diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index 4f39bf7c843f..75e53fde6b29 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -72,6 +72,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, exp->flags = NF_CT_EXPECT_PERMANENT; exp->class = NF_CT_EXPECT_CLASS_DEFAULT; rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, NULL); write_pnet(&exp->net, net); #ifdef CONFIG_NF_CONNTRACK_ZONES exp->zone = ct->zone; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index b08189226320..b521b5ebd664 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -568,6 +568,13 @@ static void destroy_gre_conntrack(struct nf_conn *ct) #endif } +static void warn_on_keymap_list_leak(const struct net *net) +{ +#ifdef CONFIG_NF_CT_PROTO_GRE + WARN_ON_ONCE(!list_empty(&net->ct.nf_ct_proto.gre.keymap_list)); +#endif +} + void nf_ct_destroy(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; @@ -1811,14 +1818,17 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, spin_lock_bh(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl)); if (exp) { + struct nf_conntrack_helper *assign_helper; + /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &ct->status); /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ ct->master = exp->master; - if (exp->helper) { + assign_helper = rcu_dereference(exp->assign_helper); + if (assign_helper) { help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) - rcu_assign_pointer(help->helper, exp->helper); + rcu_assign_pointer(help->helper, assign_helper); } #ifdef CONFIG_NF_CONNTRACK_MARK @@ -2507,6 +2517,7 @@ i_see_dead_people: } list_for_each_entry(net, net_exit_list, exit_list) { + warn_on_keymap_list_leak(net); nf_conntrack_ecache_pernet_fini(net); nf_conntrack_expect_pernet_fini(net); free_percpu(net->ct.stat); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 24d0576d84b7..8e943efbdf0a 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -344,6 +344,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, helper = rcu_dereference(help->helper); rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, NULL); write_pnet(&exp->net, net); #ifdef CONFIG_NF_CONNTRACK_ZONES exp->zone = ct->zone; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 3f5c50455b71..b2fe6554b9cf 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -643,7 +643,7 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, IPPROTO_TCP, NULL, &port); - rcu_assign_pointer(exp->helper, &nf_conntrack_helper_h245); + rcu_assign_pointer(exp->assign_helper, &nf_conntrack_helper_h245); nathook = rcu_dereference(nfct_h323_nat_hook); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, @@ -767,7 +767,7 @@ static int expect_callforwarding(struct sk_buff *skb, nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); nathook = rcu_dereference(nfct_h323_nat_hook); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, @@ -1234,7 +1234,7 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3 : NULL, &ct->tuplehash[!dir].tuple.dst.u3, IPPROTO_TCP, NULL, &port); - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */ nathook = rcu_dereference(nfct_h323_nat_hook); @@ -1306,7 +1306,7 @@ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct, nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_UDP, NULL, &port); - rcu_assign_pointer(exp->helper, nf_conntrack_helper_ras); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_ras); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect RAS "); @@ -1523,7 +1523,7 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); exp->flags = NF_CT_EXPECT_PERMANENT; - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); @@ -1577,7 +1577,7 @@ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); exp->flags = NF_CT_EXPECT_PERMANENT; - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index a715304a53d8..17e971bd4c74 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -321,8 +321,8 @@ __printf(3, 4) void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, const char *fmt, ...) { + const char *helper_name = "(null)"; const struct nf_conn_help *help; - const struct nf_conntrack_helper *helper; struct va_format vaf; va_list args; @@ -331,14 +331,17 @@ void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, vaf.fmt = fmt; vaf.va = &args; - /* Called from the helper function, this call never fails */ help = nfct_help(ct); + if (help) { + const struct nf_conntrack_helper *helper; - /* rcu_read_lock()ed by nf_hook_thresh */ - helper = rcu_dereference(help->helper); + helper = rcu_dereference(help->helper); + if (helper) + helper_name = helper->name; + } nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, - "nf_ct_%s: dropping packet: %pV ", helper->name, &vaf); + "helper %s dropping packet: %pV ", helper_name, &vaf); va_end(args); } @@ -400,6 +403,11 @@ static bool expect_iter_me(struct nf_conntrack_expect *exp, void *data) this = rcu_dereference_protected(exp->helper, lockdep_is_held(&nf_conntrack_expect_lock)); + if (this == me) + return true; + + this = rcu_dereference_protected(exp->assign_helper, + lockdep_is_held(&nf_conntrack_expect_lock)); return this == me; } diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 522183b9a604..2ebe4cb47cf6 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -203,7 +203,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, if (parse_dcc(data, data_limit, &dcc_ip, &dcc_port, &addr_beg_p, &addr_end_p)) { pr_debug("unable to parse dcc command\n"); - continue; + goto out; } pr_debug("DCC bound ip/port: %pI4:%u\n", @@ -217,7 +217,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n", &tuple->src.u3.ip, &dcc_ip, dcc_port); - continue; + goto out; } exp = nf_ct_expect_alloc(ct); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index eda5fe4a75c8..befa7e83ee49 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2634,6 +2634,7 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct, + const struct nf_conntrack_helper *assign_helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask); @@ -2860,6 +2861,7 @@ static int ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, u32 portid, u32 report) { + struct nf_conntrack_helper *assign_helper = NULL; struct nlattr *cda[CTA_EXPECT_MAX+1]; struct nf_conntrack_tuple tuple, mask; struct nf_conntrack_expect *exp; @@ -2870,13 +2872,26 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, if (err < 0) return err; + if (!cda[CTA_EXPECT_TUPLE] || !cda[CTA_EXPECT_MASK]) + return -EINVAL; + err = ctnetlink_glue_exp_parse((const struct nlattr * const *)cda, ct, &tuple, &mask); if (err < 0) return err; + if (cda[CTA_EXPECT_HELP_NAME]) { + const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); + + assign_helper = __nf_conntrack_helper_find(helpname, + nf_ct_l3num(ct), + tuple.dst.protonum); + if (!assign_helper) + return -EOPNOTSUPP; + } + exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct, - &tuple, &mask); + assign_helper, &tuple, &mask); if (IS_ERR(exp)) return PTR_ERR(exp); @@ -3515,6 +3530,7 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr, static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, + const struct nf_conntrack_helper *assign_helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask) { @@ -3568,6 +3584,7 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, exp->zone = ct->zone; #endif rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, assign_helper); exp->tuple = *tuple; exp->mask.src.u3 = mask->src.u3; exp->mask.src.u.all = mask->src.u.all; @@ -3623,7 +3640,7 @@ ctnetlink_create_expect(struct net *net, ct = nf_ct_tuplehash_to_ctrack(h); rcu_read_lock(); - exp = ctnetlink_alloc_expect(cda, ct, &tuple, &mask); + exp = ctnetlink_alloc_expect(cda, ct, NULL, &tuple, &mask); if (IS_ERR(exp)) { err = PTR_ERR(exp); goto err_rcu; diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 4c679638df06..dc23e4181618 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -225,13 +225,9 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid) if (nf_ct_expect_related(exp_reply, 0) != 0) goto out_unexpect_orig; - /* Add GRE keymap entries */ - if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_ORIGINAL, &exp_orig->tuple) != 0) + if (!nf_ct_gre_keymap_add(ct, &exp_orig->tuple, + &exp_reply->tuple)) goto out_unexpect_both; - if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_REPLY, &exp_reply->tuple) != 0) { - nf_ct_gre_keymap_destroy(ct); - goto out_unexpect_both; - } ret = 0; out_put_both: diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 94c19bc4edc5..35e22082d65a 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -87,41 +87,97 @@ static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t) return key; } -/* add a single keymap entry, associate with specified master ct */ -int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, - struct nf_conntrack_tuple *t) +enum nf_ct_gre_km_act { + NF_CT_GRE_KM_NEW, + NF_CT_GRE_KM_BAD, + NF_CT_GRE_KM_DUP +}; + +static enum nf_ct_gre_km_act +nf_ct_gre_km_acceptable(const struct nf_ct_pptp_master *ct_pptp_info, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl) +{ + struct nf_ct_gre_keymap *km_orig, *km_repl; + + lockdep_assert_held(&keymap_lock); + + km_orig = ct_pptp_info->keymap[IP_CT_DIR_ORIGINAL]; + km_repl = ct_pptp_info->keymap[IP_CT_DIR_REPLY]; + + if (km_orig && km_repl) { + if (!gre_key_cmpfn(km_orig, orig)) + return NF_CT_GRE_KM_BAD; + + if (!gre_key_cmpfn(km_repl, repl)) + return NF_CT_GRE_KM_BAD; + + return NF_CT_GRE_KM_DUP; + } + + DEBUG_NET_WARN_ON_ONCE(km_orig); + DEBUG_NET_WARN_ON_ONCE(km_repl); + return NF_CT_GRE_KM_NEW; +} + +/* add keymap entries, associate with specified master ct */ +bool nf_ct_gre_keymap_add(struct nf_conn *ct, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl) { struct net *net = nf_ct_net(ct); struct nf_gre_net *net_gre = gre_pernet(net); struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct); - struct nf_ct_gre_keymap **kmp, *km; - - kmp = &ct_pptp_info->keymap[dir]; - if (*kmp) { - /* check whether it's a retransmission */ - list_for_each_entry_rcu(km, &net_gre->keymap_list, list) { - if (gre_key_cmpfn(km, t) && km == *kmp) - return 0; - } - pr_debug("trying to override keymap_%s for ct %p\n", - dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct); - return -EEXIST; - } + struct nf_ct_gre_keymap *km_orig, *km_repl; + bool ret = false; - km = kmalloc_obj(*km, GFP_ATOMIC); - if (!km) - return -ENOMEM; - memcpy(&km->tuple, t, sizeof(*t)); - *kmp = km; + km_orig = kmalloc_obj(*km_orig, GFP_ATOMIC); + if (!km_orig) + return false; + km_repl = kmalloc_obj(*km_repl, GFP_ATOMIC); + if (!km_repl) + goto km_free; - pr_debug("adding new entry %p: ", km); - nf_ct_dump_tuple(&km->tuple); + memcpy(&km_orig->tuple, orig, sizeof(*orig)); + memcpy(&km_repl->tuple, repl, sizeof(*repl)); spin_lock_bh(&keymap_lock); - list_add_tail(&km->list, &net_gre->keymap_list); + if (nf_ct_is_dying(ct)) + goto unlock_free; + + switch (nf_ct_gre_km_acceptable(ct_pptp_info, orig, repl)) { + case NF_CT_GRE_KM_NEW: + break; + case NF_CT_GRE_KM_DUP: + ret = true; + goto unlock_free; + case NF_CT_GRE_KM_BAD: + pr_debug("trying to override keymap for ct %p\n", ct); + goto unlock_free; + } + + if (ct_pptp_info->keymap[IP_CT_DIR_ORIGINAL] || + ct_pptp_info->keymap[IP_CT_DIR_REPLY]) + goto unlock_free; + + pr_debug("adding new entries %p,%p: ", km_orig, km_repl); + nf_ct_dump_tuple(&km_orig->tuple); + nf_ct_dump_tuple(&km_repl->tuple); + + list_add_tail_rcu(&km_orig->list, &net_gre->keymap_list); + list_add_tail_rcu(&km_repl->list, &net_gre->keymap_list); + ct_pptp_info->keymap[IP_CT_DIR_ORIGINAL] = km_orig; + ct_pptp_info->keymap[IP_CT_DIR_REPLY] = km_repl; spin_unlock_bh(&keymap_lock); - return 0; + return true; + +unlock_free: + spin_unlock_bh(&keymap_lock); +km_free: + kfree(km_orig); + kfree(km_repl); + return ret; } EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b67426c2189b..e99ab1e88e9f 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1221,7 +1221,8 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, new_state = old_state; } if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status) - && ct->proto.tcp.last_index == TCP_SYN_SET) + && ct->proto.tcp.last_index == TCP_SYN_SET + && ct->proto.tcp.last_dir != dir) || (!test_bit(IPS_ASSURED_BIT, &ct->status) && ct->proto.tcp.last_index == TCP_ACK_SET)) && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 1eb55907d470..e69941f1a101 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1366,6 +1366,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, goto store_cseq; } + helper = rcu_dereference(nfct_help(ct)->helper); + if (!helper) + return NF_DROP; + exp = nf_ct_expect_alloc(ct); if (!exp) { nf_ct_helper_log(skb, ct, "cannot alloc expectation"); @@ -1376,14 +1380,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, if (sip_direct_signalling) saddr = &ct->tuplehash[!dir].tuple.src.u3; - helper = rcu_dereference(nfct_help(ct)->helper); - if (!helper) - return NF_DROP; - nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct), saddr, &daddr, proto, NULL, &port); exp->timeout.expires = sip_timeout * HZ; - rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, helper); exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE; hooks = rcu_dereference(nf_nat_sip_hooks); diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index e348fb90b8dc..3b0a70e154cd 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -13,22 +13,6 @@ #include <net/netfilter/nf_tables_offload.h> #include <net/netfilter/nf_dup_netdev.h> -#define NF_RECURSION_LIMIT 2 - -#ifndef CONFIG_PREEMPT_RT -static u8 *nf_get_nf_dup_skb_recursion(void) -{ - return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion); -} -#else - -static u8 *nf_get_nf_dup_skb_recursion(void) -{ - return ¤t->net_xmit.nf_dup_skb_recursion; -} - -#endif - static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, enum nf_dev_hooks hook) { diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 2c4140e6f53c..785d8c244a77 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -122,6 +122,7 @@ static int flow_offload_fill_route(struct flow_offload *flow, flow_tuple->tun = route->tuple[dir].in.tun; flow_tuple->encap_num = route->tuple[dir].in.num_encaps; + flow_tuple->needs_gso_segment = route->tuple[dir].out.needs_gso_segment; flow_tuple->tun_num = route->tuple[dir].in.num_tuns; switch (route->tuple[dir].xmit_type) { diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index fd56d663cb5b..9c05a50d6013 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -445,13 +445,13 @@ static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx, switch (skb->protocol) { case htons(ETH_P_8021Q): vlan_hdr = (struct vlan_hdr *)skb->data; - __skb_pull(skb, VLAN_HLEN); + skb_pull_rcsum(skb, VLAN_HLEN); vlan_set_encap_proto(skb, vlan_hdr); skb_reset_network_header(skb); break; case htons(ETH_P_PPP_SES): skb->protocol = __nf_flow_pppoe_proto(skb); - skb_pull(skb, PPPOE_SES_HLEN); + skb_pull_rcsum(skb, PPPOE_SES_HLEN); skb_reset_network_header(skb); break; } @@ -462,23 +462,6 @@ static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx, nf_flow_ip_tunnel_pop(ctx, skb); } -struct nf_flow_xmit { - const void *dest; - const void *source; - struct net_device *outdev; -}; - -static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, - struct nf_flow_xmit *xmit) -{ - skb->dev = xmit->outdev; - dev_hard_header(skb, skb->dev, ntohs(skb->protocol), - xmit->dest, xmit->source, skb->len); - dev_queue_xmit(skb); - - return NF_STOLEN; -} - static struct flow_offload_tuple_rhash * nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, struct nf_flowtable *flow_table, struct sk_buff *skb) @@ -524,7 +507,7 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, return 0; } - if (skb_try_make_writable(skb, thoff + ctx->hdrsize)) + if (skb_ensure_writable(skb, thoff + ctx->hdrsize)) return -1; flow_offload_refresh(flow_table, flow, false); @@ -544,7 +527,34 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, return 1; } -static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) +/* Similar to skb_vlan_push. */ +static int nf_flow_vlan_push(struct sk_buff *skb, __be16 proto, u16 id, + u32 needed_headroom) +{ + if (skb_vlan_tag_present(skb)) { + struct vlan_hdr *vhdr; + + if (skb_cow_head(skb, needed_headroom + VLAN_HLEN)) + return -1; + + __skb_push(skb, VLAN_HLEN); + if (skb_mac_header_was_set(skb)) + skb->mac_header -= VLAN_HLEN; + + vhdr = (struct vlan_hdr *)skb->data; + skb->network_header -= VLAN_HLEN; + vhdr->h_vlan_TCI = htons(skb_vlan_tag_get(skb)); + vhdr->h_vlan_encapsulated_proto = skb->protocol; + skb->protocol = skb->vlan_proto; + skb_postpush_rcsum(skb, skb->data, VLAN_HLEN); + } + __vlan_hwaccel_put_tag(skb, proto, id); + + return 0; +} + +static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id, + u32 needed_headroom) { int data_len = skb->len + sizeof(__be16); struct ppp_hdr { @@ -553,7 +563,7 @@ static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) } *ph; __be16 proto; - if (skb_cow_head(skb, PPPOE_SES_HLEN)) + if (skb_cow_head(skb, needed_headroom + PPPOE_SES_HLEN)) return -1; switch (skb->protocol) { @@ -730,21 +740,24 @@ static int nf_flow_tunnel_v6_push(struct net *net, struct sk_buff *skb, } static int nf_flow_encap_push(struct sk_buff *skb, - struct flow_offload_tuple *tuple) + struct flow_offload_tuple *tuple, + struct net_device *outdev) { + u32 needed_headroom = LL_RESERVED_SPACE(outdev); int i; - for (i = 0; i < tuple->encap_num; i++) { + for (i = tuple->encap_num - 1; i >= 0; i--) { switch (tuple->encap[i].proto) { case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): - skb_reset_mac_header(skb); - if (skb_vlan_push(skb, tuple->encap[i].proto, - tuple->encap[i].id) < 0) + if (nf_flow_vlan_push(skb, tuple->encap[i].proto, + tuple->encap[i].id, + needed_headroom) < 0) return -1; break; case htons(ETH_P_PPP_SES): - if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0) + if (nf_flow_pppoe_push(skb, tuple->encap[i].id, + needed_headroom) < 0) return -1; break; } @@ -753,6 +766,76 @@ static int nf_flow_encap_push(struct sk_buff *skb, return 0; } +struct nf_flow_xmit { + const void *dest; + const void *source; + struct net_device *outdev; + struct flow_offload_tuple *tuple; + bool needs_gso_segment; +}; + +static void __nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, + struct nf_flow_xmit *xmit) +{ + struct net_device *dev = xmit->outdev; + unsigned int hh_len = LL_RESERVED_SPACE(dev); + + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + skb = skb_expand_head(skb, hh_len); + if (!skb) + return; + } + + skb->dev = dev; + dev_hard_header(skb, dev, ntohs(skb->protocol), + xmit->dest, xmit->source, skb->len); + dev_queue_xmit(skb); +} + +static unsigned int nf_flow_encap_gso_xmit(struct net *net, struct sk_buff *skb, + struct nf_flow_xmit *xmit) +{ + struct sk_buff *segs, *nskb; + + segs = skb_gso_segment(skb, 0); + if (IS_ERR(segs)) + return NF_DROP; + + if (segs) + consume_skb(skb); + else + segs = skb; + + skb_list_walk_safe(segs, segs, nskb) { + skb_mark_not_on_list(segs); + + if (nf_flow_encap_push(segs, xmit->tuple, xmit->outdev) < 0) { + kfree_skb(segs); + kfree_skb_list(nskb); + return NF_STOLEN; + } + __nf_flow_queue_xmit(net, segs, xmit); + } + + return NF_STOLEN; +} + +static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, + struct nf_flow_xmit *xmit) +{ + if (xmit->tuple->encap_num) { + if (skb_is_gso(skb) && xmit->needs_gso_segment) + return nf_flow_encap_gso_xmit(net, skb, xmit); + + if (nf_flow_encap_push(skb, xmit->tuple, xmit->outdev) < 0) + return NF_DROP; + } + + __nf_flow_queue_xmit(net, skb, xmit); + + return NF_STOLEN; +} + unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -797,9 +880,6 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0) return NF_DROP; - if (nf_flow_encap_push(skb, other_tuple) < 0) - return NF_DROP; - switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rtable(tuplehash->tuple.dst_cache); @@ -829,6 +909,8 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, WARN_ON_ONCE(1); return NF_DROP; } + xmit.tuple = other_tuple; + xmit.needs_gso_segment = tuplehash->tuple.needs_gso_segment; return nf_flow_queue_xmit(state->net, skb, &xmit); } @@ -1037,7 +1119,7 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, return 0; } - if (skb_try_make_writable(skb, thoff + ctx->hdrsize)) + if (skb_ensure_writable(skb, thoff + ctx->hdrsize)) return -1; flow_offload_refresh(flow_table, flow, false); @@ -1119,9 +1201,6 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, &ip6_daddr, encap_limit) < 0) return NF_DROP; - if (nf_flow_encap_push(skb, other_tuple) < 0) - return NF_DROP; - switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rt6_info(tuplehash->tuple.dst_cache); @@ -1151,6 +1230,8 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, WARN_ON_ONCE(1); return NF_DROP; } + xmit.tuple = other_tuple; + xmit.needs_gso_segment = tuplehash->tuple.needs_gso_segment; return nf_flow_queue_xmit(state->net, skb, &xmit); } diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c index 6bb9579dcc2a..9e88ea6a2eef 100644 --- a/net/netfilter/nf_flow_table_path.c +++ b/net/netfilter/nf_flow_table_path.c @@ -86,6 +86,7 @@ struct nft_forward_info { u8 ingress_vlans; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; + bool needs_gso_segment; enum flow_offload_xmit_type xmit_type; }; @@ -138,8 +139,11 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack, path->encap.proto; info->num_encaps++; } - if (path->type == DEV_PATH_PPPOE) + if (path->type == DEV_PATH_PPPOE) { memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; + info->needs_gso_segment = 1; + } break; case DEV_PATH_BRIDGE: if (is_zero_ether_addr(info->h_source)) @@ -279,6 +283,7 @@ static void nft_dev_forward_path(const struct nft_pktinfo *pkt, memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); route->tuple[dir].xmit_type = info.xmit_type; } + route->tuple[dir].out.needs_gso_segment = info.needs_gso_segment; } int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index a6c81c04b3a5..57b450024a99 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -61,6 +61,7 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) struct nf_hook_state *state = &entry->state; /* Release those devices we held, or Alexey will kill me. */ + dev_put(entry->skb_dev); dev_put(state->in); dev_put(state->out); if (state->sk) @@ -102,6 +103,7 @@ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt)) return false; + dev_hold(entry->skb_dev); dev_hold(state->in); dev_hold(state->out); @@ -202,11 +204,11 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, *entry = (struct nf_queue_entry) { .skb = skb, + .skb_dev = skb->dev, .state = *state, .hook_index = index, .size = sizeof(*entry) + route_key_size, }; - __nf_queue_entry_init_physdevs(entry); if (!nf_queue_entry_get_refs(entry)) { diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 57f57e2fc80a..ed00114f65f3 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -22,6 +22,8 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_synproxy.h> +static DEFINE_MUTEX(synproxy_mutex); + unsigned int synproxy_net_id; EXPORT_SYMBOL_GPL(synproxy_net_id); @@ -200,6 +202,8 @@ synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff, if (skb_ensure_writable(skb, optend)) return 0; + th = (struct tcphdr *)(skb->data + protoff); + while (optoff < optend) { unsigned char *op = skb->data + optoff; @@ -767,26 +771,31 @@ static const struct nf_hook_ops ipv4_synproxy_ops[] = { int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net) { - int err; + int err = 0; + mutex_lock(&synproxy_mutex); if (snet->hook_ref4 == 0) { err = nf_register_net_hooks(net, ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); if (err) - return err; + goto out; } snet->hook_ref4++; - return 0; +out: + mutex_unlock(&synproxy_mutex); + return err; } EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_init); void nf_synproxy_ipv4_fini(struct synproxy_net *snet, struct net *net) { + mutex_lock(&synproxy_mutex); snet->hook_ref4--; if (snet->hook_ref4 == 0) nf_unregister_net_hooks(net, ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); + mutex_unlock(&synproxy_mutex); } EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_fini); @@ -1191,27 +1200,32 @@ static const struct nf_hook_ops ipv6_synproxy_ops[] = { int nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net) { - int err; + int err = 0; + mutex_lock(&synproxy_mutex); if (snet->hook_ref6 == 0) { err = nf_register_net_hooks(net, ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops)); if (err) - return err; + goto out; } snet->hook_ref6++; - return 0; +out: + mutex_unlock(&synproxy_mutex); + return err; } EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_init); void nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net) { + mutex_lock(&synproxy_mutex); snet->hook_ref6--; if (snet->hook_ref6 == 0) nf_unregister_net_hooks(net, ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops)); + mutex_unlock(&synproxy_mutex); } EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini); #endif /* CONFIG_IPV6 */ diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d20ce5c36d31..87387adbca65 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -407,6 +407,7 @@ static void nft_netdev_unregister_trans_hook(struct net *net, } static void nft_netdev_unregister_hooks(struct net *net, + const struct nft_table *table, struct list_head *hook_list, bool release_netdev) { @@ -414,8 +415,10 @@ static void nft_netdev_unregister_hooks(struct net *net, struct nf_hook_ops *ops; list_for_each_entry_safe(hook, next, hook_list, list) { - list_for_each_entry(ops, &hook->ops_list, list) - nf_unregister_net_hook(net, ops); + if (!(table->flags & NFT_TABLE_F_DORMANT)) { + list_for_each_entry(ops, &hook->ops_list, list) + nf_unregister_net_hook(net, ops); + } if (release_netdev) nft_netdev_hook_unlink_free_rcu(hook); } @@ -452,20 +455,25 @@ static void __nf_tables_unregister_hook(struct net *net, struct nft_base_chain *basechain; const struct nf_hook_ops *ops; - if (table->flags & NFT_TABLE_F_DORMANT || - !nft_is_base_chain(chain)) + if (!nft_is_base_chain(chain)) return; basechain = nft_base_chain(chain); ops = &basechain->ops; + /* must also be called for dormant tables */ + if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) { + nft_netdev_unregister_hooks(net, table, &basechain->hook_list, + release_netdev); + return; + } + + if (table->flags & NFT_TABLE_F_DORMANT) + return; + if (basechain->type->ops_unregister) return basechain->type->ops_unregister(net, ops); - if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) - nft_netdev_unregister_hooks(net, &basechain->hook_list, - release_netdev); - else - nf_unregister_net_hook(net, &basechain->ops); + nf_unregister_net_hook(net, &basechain->ops); } static void nf_tables_unregister_hook(struct net *net, @@ -4205,6 +4213,7 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) struct nft_chain *chain; struct nft_ctx ctx = { .net = net, + .table = (struct nft_table *)table, .family = table->family, }; int err = 0; @@ -11281,11 +11290,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { - if (!(table->flags & NFT_TABLE_F_DORMANT)) { - nft_netdev_unregister_hooks(net, - &nft_trans_chain_hooks(trans), - true); - } + nft_netdev_unregister_hooks(net, table, + &nft_trans_chain_hooks(trans), + true); free_percpu(nft_trans_chain_stats(trans)); kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 5ddd5b6e135f..8ab186f86dd4 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -153,7 +153,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, if (priv->base == NFT_PAYLOAD_NETWORK_HEADER) ptr = skb_network_header(skb) + pkt->nhoff; else { - if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) + if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff) return false; ptr = skb->data + nft_thoff(pkt); } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 58304fd1f70f..60ab88d45096 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1141,6 +1141,9 @@ nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int di { struct sk_buff *nskb; + if (e->state.net->user_ns != &init_user_ns) + return -EPERM; + if (diff < 0) { unsigned int min_len = skb_transport_offset(e->skb); @@ -1212,6 +1215,8 @@ dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) if (physinif == ifindex || physoutif == ifindex) return 1; #endif + if (entry->skb_dev && entry->skb_dev->ifindex == ifindex) + return 1; if (entry->state.in) if (entry->state.in->ifindex == ifindex) return 1; @@ -1535,8 +1540,7 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), payload_len, entry, diff) < 0) verdict = NF_DROP; - - if (ct && diff) + else if (ct && diff) nfnl_ct->seq_adjust(entry->skb, ct, ctinfo, diff); } diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 94dccdcfa06b..785b8e9731d1 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -43,8 +43,10 @@ static void nft_bitwise_eval_lshift(u32 *dst, const u32 *src, u32 carry = 0; for (i = DIV_ROUND_UP(priv->len, sizeof(u32)); i > 0; i--) { - dst[i - 1] = (src[i - 1] << shift) | carry; - carry = src[i - 1] >> (BITS_PER_TYPE(u32) - shift); + u32 tmp_src = src[i - 1]; + + dst[i - 1] = (tmp_src << shift) | carry; + carry = tmp_src >> (BITS_PER_TYPE(u32) - shift); } } @@ -56,8 +58,10 @@ static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src, u32 carry = 0; for (i = 0; i < DIV_ROUND_UP(priv->len, sizeof(u32)); i++) { - dst[i] = carry | (src[i] >> shift); - carry = src[i] << (BITS_PER_TYPE(u32) - shift); + u32 tmp_src = src[i]; + + dst[i] = carry | (tmp_src >> shift); + carry = tmp_src << (BITS_PER_TYPE(u32) - shift); } } @@ -235,6 +239,9 @@ static int nft_bitwise_init_bool(const struct nft_ctx *ctx, &priv->sreg2, priv->len); if (err < 0) return err; + + if (nft_reg_overlap(priv->sreg2, priv->dreg, priv->len)) + return -EINVAL; } return 0; @@ -265,6 +272,9 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, if (err < 0) return err; + if (nft_reg_overlap(priv->sreg, priv->dreg, priv->len)) + return -EINVAL; + if (tb[NFTA_BITWISE_OP]) { priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])); switch (priv->op) { diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index e00dddfa2fc0..dfd41fc8d9b8 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -19,7 +19,6 @@ struct nft_byteorder { u8 sreg; u8 dreg; enum nft_byteorder_ops op:8; - u8 len; u8 size; }; @@ -28,13 +27,8 @@ void nft_byteorder_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { const struct nft_byteorder *priv = nft_expr_priv(expr); - u32 *src = ®s->data[priv->sreg]; + const u32 *src = ®s->data[priv->sreg]; u32 *dst = ®s->data[priv->dreg]; - u16 *s16, *d16; - unsigned int i; - - s16 = (void *)src; - d16 = (void *)dst; switch (priv->size) { case 8: { @@ -43,18 +37,14 @@ void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->op) { case NFT_BYTEORDER_NTOH: - for (i = 0; i < priv->len / 8; i++) { - src64 = nft_reg_load64(&src[i]); - nft_reg_store64(&dst64[i], - be64_to_cpu((__force __be64)src64)); - } + src64 = nft_reg_load64(src); + + nft_reg_store64(dst64, be64_to_cpu((__force __be64)src64)); break; case NFT_BYTEORDER_HTON: - for (i = 0; i < priv->len / 8; i++) { - src64 = (__force __u64) - cpu_to_be64(nft_reg_load64(&src[i])); - nft_reg_store64(&dst64[i], src64); - } + src64 = (__force __u64)cpu_to_be64(nft_reg_load64(src)); + + nft_reg_store64(dst64, src64); break; } break; @@ -62,24 +52,20 @@ void nft_byteorder_eval(const struct nft_expr *expr, case 4: switch (priv->op) { case NFT_BYTEORDER_NTOH: - for (i = 0; i < priv->len / 4; i++) - dst[i] = ntohl((__force __be32)src[i]); + *dst = ntohl((__force __be32)*src); break; case NFT_BYTEORDER_HTON: - for (i = 0; i < priv->len / 4; i++) - dst[i] = (__force __u32)htonl(src[i]); + *dst = (__force __u32)htonl(*src); break; } break; case 2: switch (priv->op) { case NFT_BYTEORDER_NTOH: - for (i = 0; i < priv->len / 2; i++) - d16[i] = ntohs((__force __be16)s16[i]); + nft_reg_store16(dst, ntohs(nft_reg_load_be16(src))); break; case NFT_BYTEORDER_HTON: - for (i = 0; i < priv->len / 2; i++) - d16[i] = (__force __u16)htons(s16[i]); + nft_reg_store_be16(dst, htons(nft_reg_load16(src))); break; } break; @@ -137,16 +123,25 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, if (err < 0) return err; - priv->len = len; + /* no longer support multi-reg conversions */ + if (len != size) + return -EOPNOTSUPP; err = nft_parse_register_load(ctx, tb[NFTA_BYTEORDER_SREG], &priv->sreg, - priv->len); + len); + if (err < 0) + return err; + + err = nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + len); if (err < 0) return err; - return nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG], - &priv->dreg, NULL, NFT_DATA_VALUE, - priv->len); + if (nft_reg_overlap(priv->sreg, priv->dreg, len)) + return -EINVAL; + + return 0; } static int nft_byteorder_dump(struct sk_buff *skb, @@ -160,10 +155,11 @@ static int nft_byteorder_dump(struct sk_buff *skb, goto nla_put_failure; if (nla_put_be32(skb, NFTA_BYTEORDER_OP, htonl(priv->op))) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_BYTEORDER_LEN, htonl(priv->len))) - goto nla_put_failure; if (nla_put_be32(skb, NFTA_BYTEORDER_SIZE, htonl(priv->size))) goto nla_put_failure; + /* compatibility for old userspace which permitted size != len */ + if (nla_put_be32(skb, NFTA_BYTEORDER_LEN, htonl(priv->size))) + goto nla_put_failure; return 0; nla_put_failure: diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index decc725a33c2..0caa9304d2d0 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -261,10 +261,10 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return ret; } - nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); - nft_compat_wait_for_destructors(ctx->net); + nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); + ret = xt_check_target(&par, size, proto, inv); if (ret < 0) { if (ret == -ENOENT) { @@ -353,8 +353,6 @@ nla_put_failure: static int nft_target_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { - struct xt_target *target = expr->ops->data; - unsigned int hook_mask = 0; int ret; if (ctx->family != NFPROTO_IPV4 && @@ -377,11 +375,21 @@ static int nft_target_validate(const struct nft_ctx *ctx, const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops; + unsigned int hook_mask = 1 << ops->hooknum; + struct xt_target *target = expr->ops->data; + void *info = nft_expr_priv(expr); + struct xt_tgchk_param par; + union nft_entry e = {}; - hook_mask = 1 << ops->hooknum; if (target->hooks && !(hook_mask & target->hooks)) return -EINVAL; + nft_target_set_tgchk_param(&par, ctx, target, info, &e, 0, false); + + ret = xt_check_hooks_target(&par); + if (ret < 0) + return ret; + ret = nft_compat_chain_validate_dependency(ctx, target->table); if (ret < 0) return ret; @@ -515,10 +523,10 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return ret; } - nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); - nft_compat_wait_for_destructors(ctx->net); + nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); + return xt_check_match(&par, size, proto, inv); } @@ -614,8 +622,6 @@ static int nft_match_large_dump(struct sk_buff *skb, static int nft_match_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { - struct xt_match *match = expr->ops->data; - unsigned int hook_mask = 0; int ret; if (ctx->family != NFPROTO_IPV4 && @@ -638,11 +644,30 @@ static int nft_match_validate(const struct nft_ctx *ctx, const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops; + unsigned int hook_mask = 1 << ops->hooknum; + struct xt_match *match = expr->ops->data; + size_t size = XT_ALIGN(match->matchsize); + struct xt_mtchk_param par; + union nft_entry e = {}; + void *info; - hook_mask = 1 << ops->hooknum; if (match->hooks && !(hook_mask & match->hooks)) return -EINVAL; + if (NFT_EXPR_SIZE(size) > NFT_MATCH_LARGE_THRESH) { + struct nft_xt_match_priv *priv = nft_expr_priv(expr); + + info = priv->info; + } else { + info = nft_expr_priv(expr); + } + + nft_match_set_mtchk_param(&par, ctx, match, info, &e, 0, false); + + ret = xt_check_hooks_match(&par); + if (ret < 0) + return ret; + ret = nft_compat_chain_validate_dependency(ctx, match->table); if (ret < 0) return ret; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 60ee8d932fcb..357513c6dcea 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -78,7 +78,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, break; } - if (ct == NULL) + if (!ct || nf_ct_is_template(ct)) goto err; switch (priv->key) { @@ -180,12 +180,10 @@ static void nft_ct_get_eval(const struct nft_expr *expr, tuple = &ct->tuplehash[priv->dir].tuple; switch (priv->key) { case NFT_CT_SRC: - memcpy(dest, tuple->src.u3.all, - nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); + memcpy(dest, tuple->src.u3.all, priv->len); return; case NFT_CT_DST: - memcpy(dest, tuple->dst.u3.all, - nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); + memcpy(dest, tuple->dst.u3.all, priv->len); return; case NFT_CT_PROTO_SRC: nft_reg_store16(dest, (__force u16)tuple->src.u.all); @@ -1334,6 +1332,8 @@ static void nft_ct_expect_obj_eval(struct nft_object *obj, if (nf_ct_expect_related(exp, 0) != 0) regs->verdict.code = NF_DROP; + + nf_ct_expect_put(exp); } static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = { diff --git a/net/netfilter/nft_ct_fast.c b/net/netfilter/nft_ct_fast.c index e684c8a91848..ecf7b3a404be 100644 --- a/net/netfilter/nft_ct_fast.c +++ b/net/netfilter/nft_ct_fast.c @@ -30,7 +30,7 @@ void nft_ct_get_fast_eval(const struct nft_expr *expr, break; } - if (!ct) { + if (!ct || nf_ct_is_template(ct)) { regs->verdict.code = NFT_BREAK; return; } diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 0407d6f708ae..e6a07c0df207 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -376,7 +376,7 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr, const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; - if (pkt->tprot != IPPROTO_SCTP) + if (pkt->tprot != IPPROTO_SCTP || pkt->fragoff) goto err; do { diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 4bce36c3a6a0..b9e88d7cf308 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -95,12 +95,15 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { + u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion(); struct nft_fwd_neigh *priv = nft_expr_priv(expr); void *addr = ®s->data[priv->sreg_addr]; int oif = regs->data[priv->sreg_dev]; unsigned int verdict = NF_STOLEN; struct sk_buff *skb = pkt->skb; + int nhoff = skb_network_offset(skb); struct net_device *dev; + unsigned int hh_len; int neigh_table; switch (priv->nfproto) { @@ -111,7 +114,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, verdict = NFT_BREAK; goto out; } - if (skb_try_make_writable(skb, sizeof(*iph))) { + if (skb_ensure_writable(skb, nhoff + sizeof(*iph))) { verdict = NF_DROP; goto out; } @@ -132,7 +135,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, verdict = NFT_BREAK; goto out; } - if (skb_try_make_writable(skb, sizeof(*ip6h))) { + if (skb_ensure_writable(skb, nhoff + sizeof(*ip6h))) { verdict = NF_DROP; goto out; } @@ -151,13 +154,31 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, goto out; } + if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT) { + verdict = NF_DROP; + goto out; + } + dev = dev_get_by_index_rcu(nft_net(pkt), oif); - if (dev == NULL) - return; + if (dev == NULL) { + verdict = NF_DROP; + goto out; + } + + hh_len = LL_RESERVED_SPACE(dev); + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + skb = skb_expand_head(skb, hh_len); + if (!skb) { + verdict = NF_STOLEN; + goto out; + } + } skb->dev = dev; skb_clear_tstamp(skb); + (*nf_dup_skb_recursion)++; neigh_xmit(neigh_table, dev, addr, skb); + (*nf_dup_skb_recursion)--; out: regs->verdict.code = verdict; } diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 03ffb1159fc1..d14ca157910b 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -163,7 +163,6 @@ static int nft_inner_parse_l2l3(const struct nft_inner *priv, return -1; if (fragoff == 0) { - thoff = nhoff + sizeof(_ip6h); ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; ctx->inner_thoff = thoff; ctx->l4proto = l4proto; @@ -247,8 +246,8 @@ static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt, local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx); if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) { - local_bh_enable(); local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + local_bh_enable(); return false; } *tun_ctx = *this_cpu_tun_ctx; diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index c02d5cb52143..45fe56da5044 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -33,7 +33,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, return; } - if (pkt->tprot != IPPROTO_TCP) { + if (pkt->tprot != IPPROTO_TCP || pkt->fragoff) { regs->verdict.code = NFT_BREAK; return; } diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 01e13e5255a9..484a5490832e 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -917,6 +917,9 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, struct nft_payload_set *priv = nft_expr_priv(expr); int err; + if (ctx->net->user_ns != &init_user_ns) + return -EPERM; + priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index f2101af8c867..89be443734f6 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -30,8 +30,8 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr, __be16 tport = 0; struct sock *sk; - if (pkt->tprot != IPPROTO_TCP && - pkt->tprot != IPPROTO_UDP) { + if ((pkt->tprot != IPPROTO_TCP && + pkt->tprot != IPPROTO_UDP) || pkt->fragoff) { regs->verdict.code = NFT_BREAK; return; } @@ -97,8 +97,8 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, memset(&taddr, 0, sizeof(taddr)); - if (pkt->tprot != IPPROTO_TCP && - pkt->tprot != IPPROTO_UDP) { + if ((pkt->tprot != IPPROTO_TCP && + pkt->tprot != IPPROTO_UDP) || pkt->fragoff) { regs->verdict.code = NFT_BREAK; return; } diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 0b987bc2132a..68f7cfbbee06 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -676,7 +676,7 @@ static void nft_tunnel_obj_destroy(const struct nft_ctx *ctx, { struct nft_tunnel_obj *priv = nft_obj_data(obj); - metadata_dst_free(priv->md); + dst_release(&priv->md->dst); } static struct nft_object_type nft_tunnel_obj_type; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 9f837fb5ceb4..4e6708c23922 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -55,6 +55,9 @@ static struct list_head xt_templates[NFPROTO_NUMPROTO]; struct xt_pernet { struct list_head tables[NFPROTO_NUMPROTO]; + + /* stash area used during netns exit */ + struct list_head dead_tables[NFPROTO_NUMPROTO]; }; struct compat_delta { @@ -477,11 +480,9 @@ int xt_check_proc_name(const char *name, unsigned int size) } EXPORT_SYMBOL(xt_check_proc_name); -int xt_check_match(struct xt_mtchk_param *par, - unsigned int size, u16 proto, bool inv_proto) +static int xt_check_match_common(struct xt_mtchk_param *par, + unsigned int size, u16 proto, bool inv_proto) { - int ret; - if (XT_ALIGN(par->match->matchsize) != size && par->match->matchsize != -1) { /* @@ -530,6 +531,14 @@ int xt_check_match(struct xt_mtchk_param *par, par->match->proto); return -EINVAL; } + + return 0; +} + +static int xt_checkentry_match(struct xt_mtchk_param *par) +{ + int ret; + if (par->match->checkentry != NULL) { ret = par->match->checkentry(par); if (ret < 0) @@ -538,8 +547,34 @@ int xt_check_match(struct xt_mtchk_param *par, /* Flag up potential errors. */ return -EIO; } + + return 0; +} + +int xt_check_hooks_match(struct xt_mtchk_param *par) +{ + if (par->match->check_hooks != NULL) + return par->match->check_hooks(par); + return 0; } +EXPORT_SYMBOL_GPL(xt_check_hooks_match); + +int xt_check_match(struct xt_mtchk_param *par, + unsigned int size, u16 proto, bool inv_proto) +{ + int ret; + + ret = xt_check_match_common(par, size, proto, inv_proto); + if (ret < 0) + return ret; + + ret = xt_check_hooks_match(par); + if (ret < 0) + return ret; + + return xt_checkentry_match(par); +} EXPORT_SYMBOL_GPL(xt_check_match); /** xt_check_entry_match - check that matches end before start of target @@ -1012,11 +1047,9 @@ bool xt_find_jump_offset(const unsigned int *offsets, } EXPORT_SYMBOL(xt_find_jump_offset); -int xt_check_target(struct xt_tgchk_param *par, - unsigned int size, u16 proto, bool inv_proto) +static int xt_check_target_common(struct xt_tgchk_param *par, + unsigned int size, u16 proto, bool inv_proto) { - int ret; - if (XT_ALIGN(par->target->targetsize) != size) { pr_err_ratelimited("%s_tables: %s.%u target: invalid size %u (kernel) != (user) %u\n", xt_prefix[par->family], par->target->name, @@ -1061,6 +1094,23 @@ int xt_check_target(struct xt_tgchk_param *par, par->target->proto); return -EINVAL; } + + return 0; +} + +int xt_check_hooks_target(struct xt_tgchk_param *par) +{ + if (par->target->check_hooks != NULL) + return par->target->check_hooks(par); + + return 0; +} +EXPORT_SYMBOL_GPL(xt_check_hooks_target); + +static int xt_checkentry_target(struct xt_tgchk_param *par) +{ + int ret; + if (par->target->checkentry != NULL) { ret = par->target->checkentry(par); if (ret < 0) @@ -1071,6 +1121,22 @@ int xt_check_target(struct xt_tgchk_param *par, } return 0; } + +int xt_check_target(struct xt_tgchk_param *par, + unsigned int size, u16 proto, bool inv_proto) +{ + int ret; + + ret = xt_check_target_common(par, size, proto, inv_proto); + if (ret < 0) + return ret; + + ret = xt_check_hooks_target(par); + if (ret < 0) + return ret; + + return xt_checkentry_target(par); +} EXPORT_SYMBOL_GPL(xt_check_target); /** @@ -1409,11 +1475,9 @@ struct xt_counters *xt_counters_alloc(unsigned int counters) } EXPORT_SYMBOL(xt_counters_alloc); -struct xt_table_info * -xt_replace_table(struct xt_table *table, - unsigned int num_counters, - struct xt_table_info *newinfo, - int *error) +static struct xt_table_info * +do_replace_table(struct xt_table *table, unsigned int num_counters, + struct xt_table_info *newinfo, int *error) { struct xt_table_info *private; unsigned int cpu; @@ -1468,30 +1532,54 @@ xt_replace_table(struct xt_table *table, } } - audit_log_nfcfg(table->name, table->af, private->number, - !private->number ? AUDIT_XT_OP_REGISTER : - AUDIT_XT_OP_REPLACE, - GFP_KERNEL); + return private; +} + +struct xt_table_info * +xt_replace_table(struct xt_table *table, unsigned int num_counters, + struct xt_table_info *newinfo, + int *error) +{ + struct xt_table_info *private; + + private = do_replace_table(table, num_counters, newinfo, error); + if (private) + audit_log_nfcfg(table->name, table->af, private->number, + AUDIT_XT_OP_REPLACE, + GFP_KERNEL); + return private; } EXPORT_SYMBOL_GPL(xt_replace_table); struct xt_table *xt_register_table(struct net *net, const struct xt_table *input_table, + const struct nf_hook_ops *template_ops, struct xt_table_info *bootstrap, struct xt_table_info *newinfo) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *t, *table = NULL; + struct nf_hook_ops *ops = NULL; struct xt_table_info *private; - struct xt_table *t, *table; - int ret; + unsigned int num_ops; + int ret = -EINVAL; + + num_ops = hweight32(input_table->valid_hooks); + if (num_ops == 0) + goto out; + + ret = -ENOMEM; + if (template_ops) { + ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); + if (!ops) + goto out; + } /* Don't add one object to multiple lists. */ table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL); - if (!table) { - ret = -ENOMEM; + if (!table) goto out; - } mutex_lock(&xt[table->af].mutex); /* Don't autoload: we'd eat our tail... */ @@ -1505,7 +1593,7 @@ struct xt_table *xt_register_table(struct net *net, /* Simplifies replace_table code. */ table->private = bootstrap; - if (!xt_replace_table(table, 0, newinfo, &ret)) + if (!do_replace_table(table, 0, newinfo, &ret)) goto unlock; private = table->private; @@ -1514,34 +1602,122 @@ struct xt_table *xt_register_table(struct net *net, /* save number of initial entries */ private->initial_entries = private->number; + if (ops) { + int i; + + for (i = 0; i < num_ops; i++) + ops[i].priv = table; + + ret = nf_register_net_hooks(net, ops, num_ops); + if (ret != 0) { + mutex_unlock(&xt[table->af].mutex); + /* nf_register_net_hooks() might have published a + * base chain before internal error unwind. + */ + synchronize_rcu(); + goto out; + } + + table->ops = ops; + } + + audit_log_nfcfg(table->name, table->af, private->number, + AUDIT_XT_OP_REGISTER, GFP_KERNEL); + list_add(&table->list, &xt_net->tables[table->af]); mutex_unlock(&xt[table->af].mutex); return table; unlock: mutex_unlock(&xt[table->af].mutex); - kfree(table); out: + kfree(table); + kfree(ops); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(xt_register_table); -void *xt_unregister_table(struct xt_table *table) +/** + * xt_unregister_table_pre_exit - pre-shutdown unregister of a table + * @net: network namespace + * @af: address family (e.g., NFPROTO_IPV4, NFPROTO_IPV6) + * @name: name of the table to unregister + * + * Unregisters the specified netfilter table from the given network namespace + * and also unregisters the hooks from netfilter core: no new packets will be + * processed. + * + * This must be called prior to xt_unregister_table_exit() from the pernet + * .pre_exit callback. After this call, the table is no longer visible to + * the get/setsockopt path. In case of rmmod, module exit path must have + * called xt_unregister_template() prior to unregistering pernet ops to + * prevent re-instantiation of the table. + * + * See also: xt_unregister_table_exit() + */ +void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name) { - struct xt_table_info *private; + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *t; - mutex_lock(&xt[table->af].mutex); - private = table->private; - list_del(&table->list); - mutex_unlock(&xt[table->af].mutex); - audit_log_nfcfg(table->name, table->af, private->number, - AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); - kfree(table->ops); - kfree(table); + mutex_lock(&xt[af].mutex); + list_for_each_entry(t, &xt_net->tables[af], list) { + if (strcmp(t->name, name) == 0) { + list_move(&t->list, &xt_net->dead_tables[af]); + mutex_unlock(&xt[af].mutex); - return private; + if (t->ops) /* nat table registers with nat core, t->ops is NULL. */ + nf_unregister_net_hooks(net, t->ops, hweight32(t->valid_hooks)); + return; + } + } + mutex_unlock(&xt[af].mutex); +} +EXPORT_SYMBOL(xt_unregister_table_pre_exit); + +/** + * xt_unregister_table_exit - remove a table during namespace teardown + * @net: the network namespace from which to unregister the table + * @af: address family (e.g., NFPROTO_IPV4, NFPROTO_IPV6) + * @name: name of the table to unregister + * + * Completes the unregister process for a table. This must be called from + * the pernet ops .exit callback. This is the second stage after + * xt_unregister_table_pre_exit(). + * + * pair with xt_unregister_table_pre_exit() during namespace shutdown. + * + * Return: the unregistered table or NULL if the table was never + * instantiated. The caller needs to kfree() the table after it + * has removed the family specific matches/targets. + */ +struct xt_table *xt_unregister_table_exit(struct net *net, u8 af, const char *name) +{ + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *table; + + mutex_lock(&xt[af].mutex); + list_for_each_entry(table, &xt_net->dead_tables[af], list) { + struct nf_hook_ops *ops = NULL; + + if (strcmp(table->name, name) != 0) + continue; + + list_del(&table->list); + + audit_log_nfcfg(table->name, table->af, table->private->number, + AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); + swap(table->ops, ops); + mutex_unlock(&xt[af].mutex); + + kfree(ops); + return table; + } + mutex_unlock(&xt[af].mutex); + + return NULL; } -EXPORT_SYMBOL_GPL(xt_unregister_table); +EXPORT_SYMBOL_GPL(xt_unregister_table_exit); #endif #ifdef CONFIG_PROC_FS @@ -1988,8 +2164,10 @@ static int __net_init xt_net_init(struct net *net) struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; - for (i = 0; i < NFPROTO_NUMPROTO; i++) + for (i = 0; i < NFPROTO_NUMPROTO; i++) { INIT_LIST_HEAD(&xt_net->tables[i]); + INIT_LIST_HEAD(&xt_net->dead_tables[i]); + } return 0; } @@ -1998,8 +2176,10 @@ static void __net_exit xt_net_exit(struct net *net) struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; - for (i = 0; i < NFPROTO_NUMPROTO; i++) + for (i = 0; i < NFPROTO_NUMPROTO; i++) { WARN_ON_ONCE(!list_empty(&xt_net->tables[i])); + WARN_ON_ONCE(!list_empty(&xt_net->dead_tables[i])); + } } static struct pernet_operations xt_net_ops = { diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 498f5871c84a..d2aeacf94230 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -354,7 +354,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .family = NFPROTO_IPV4, .revision = 1, .targetsize = sizeof(struct xt_ct_target_info_v1), - .usersize = offsetof(struct xt_ct_target_info, ct), + .usersize = offsetof(struct xt_ct_target_info_v1, ct), .checkentry = xt_ct_tg_check_v1, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, @@ -366,7 +366,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .family = NFPROTO_IPV4, .revision = 2, .targetsize = sizeof(struct xt_ct_target_info_v1), - .usersize = offsetof(struct xt_ct_target_info, ct), + .usersize = offsetof(struct xt_ct_target_info_v1, ct), .checkentry = xt_ct_tg_check_v2, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, @@ -398,7 +398,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .family = NFPROTO_IPV6, .revision = 1, .targetsize = sizeof(struct xt_ct_target_info_v1), - .usersize = offsetof(struct xt_ct_target_info, ct), + .usersize = offsetof(struct xt_ct_target_info_v1, ct), .checkentry = xt_ct_tg_check_v1, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, @@ -410,7 +410,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .family = NFPROTO_IPV6, .revision = 2, .targetsize = sizeof(struct xt_ct_target_info_v1), - .usersize = offsetof(struct xt_ct_target_info, ct), + .usersize = offsetof(struct xt_ct_target_info_v1, ct), .checkentry = xt_ct_tg_check_v2, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index 466da23e36ff..b32d153e3a18 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -91,7 +91,7 @@ nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par) if (info->queues_total > 1) { if (info->flags & NFQ_FLAG_CPU_FANOUT) { - int cpu = smp_processor_id(); + int cpu = raw_smp_processor_id(); queue = info->queuenum + cpu % info->queues_total; } else { diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 116a885adb3c..80e1634bc51f 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -247,6 +247,21 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) } #endif +static int tcpmss_tg4_check_hooks(const struct xt_tgchk_param *par) +{ + const struct xt_tcpmss_info *info = par->targinfo; + + if (info->mss == XT_TCPMSS_CLAMP_PMTU && + (par->hook_mask & ~((1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING))) != 0) { + pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); + return -EINVAL; + } + + return 0; +} + /* Must specify -p tcp --syn */ static inline bool find_syn_match(const struct xt_entry_match *m) { @@ -262,17 +277,9 @@ static inline bool find_syn_match(const struct xt_entry_match *m) static int tcpmss_tg4_check(const struct xt_tgchk_param *par) { - const struct xt_tcpmss_info *info = par->targinfo; const struct ipt_entry *e = par->entryinfo; const struct xt_entry_match *ematch; - if (info->mss == XT_TCPMSS_CLAMP_PMTU && - (par->hook_mask & ~((1 << NF_INET_FORWARD) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_POST_ROUTING))) != 0) { - pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); - return -EINVAL; - } if (par->nft_compat) return 0; @@ -286,17 +293,9 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par) #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static int tcpmss_tg6_check(const struct xt_tgchk_param *par) { - const struct xt_tcpmss_info *info = par->targinfo; const struct ip6t_entry *e = par->entryinfo; const struct xt_entry_match *ematch; - if (info->mss == XT_TCPMSS_CLAMP_PMTU && - (par->hook_mask & ~((1 << NF_INET_FORWARD) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_POST_ROUTING))) != 0) { - pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); - return -EINVAL; - } if (par->nft_compat) return 0; @@ -312,6 +311,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = { { .family = NFPROTO_IPV4, .name = "TCPMSS", + .check_hooks = tcpmss_tg4_check_hooks, .checkentry = tcpmss_tg4_check, .target = tcpmss_tg4, .targetsize = sizeof(struct xt_tcpmss_info), @@ -322,6 +322,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = { { .family = NFPROTO_IPV6, .name = "TCPMSS", + .check_hooks = tcpmss_tg4_check_hooks, .checkentry = tcpmss_tg6_check, .target = tcpmss_tg6, .targetsize = sizeof(struct xt_tcpmss_info), diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index e4bea1d346cf..5f60e7298a1e 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -86,6 +86,9 @@ tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tproxy_target_info *tgi = par->targinfo; + if (par->fragoff) + return NF_DROP; + return tproxy_tg4(xt_net(par), skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value); } @@ -95,6 +98,9 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + if (par->fragoff) + return NF_DROP; + return tproxy_tg4(xt_net(par), skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); } @@ -106,6 +112,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + unsigned short fragoff = 0; struct udphdr _hdr, *hp; struct sock *sk; const struct in6_addr *laddr; @@ -113,8 +120,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) int thoff = 0; int tproto; - tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); - if (tproto < 0) + tproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL); + if (tproto < 0 || fragoff) return NF_DROP; hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index a77088943107..913dbe3aa5e2 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -153,14 +153,10 @@ addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) return ret; } -static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) +static int addrtype_mt_check_hooks(const struct xt_mtchk_param *par) { - const char *errmsg = "both incoming and outgoing interface limitation cannot be selected"; struct xt_addrtype_info_v1 *info = par->matchinfo; - - if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN && - info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) - goto err; + const char *errmsg; if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && @@ -176,6 +172,21 @@ static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) goto err; } + return 0; +err: + pr_info_ratelimited("%s\n", errmsg); + return -EINVAL; +} + +static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) +{ + const char *errmsg = "both incoming and outgoing interface limitation cannot be selected"; + struct xt_addrtype_info_v1 *info = par->matchinfo; + + if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN && + info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) + goto err; + #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) if (par->family == NFPROTO_IPV6) { if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) { @@ -211,6 +222,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = { .family = NFPROTO_IPV4, .revision = 1, .match = addrtype_mt_v1, + .check_hooks = addrtype_mt_check_hooks, .checkentry = addrtype_mt_checkentry_v1, .matchsize = sizeof(struct xt_addrtype_info_v1), .me = THIS_MODULE @@ -221,6 +233,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = { .family = NFPROTO_IPV6, .revision = 1, .match = addrtype_mt_v1, + .check_hooks = addrtype_mt_check_hooks, .checkentry = addrtype_mt_checkentry_v1, .matchsize = sizeof(struct xt_addrtype_info_v1), .me = THIS_MODULE diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c index 3bdc302a0f91..9cb259902a58 100644 --- a/net/netfilter/xt_cpu.c +++ b/net/netfilter/xt_cpu.c @@ -34,7 +34,7 @@ static bool cpu_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_cpu_info *info = par->matchinfo; - return (info->cpu == smp_processor_id()) ^ info->invert; + return (info->cpu == raw_smp_processor_id()) ^ info->invert; } static struct xt_match cpu_mt_reg __read_mostly = { diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c index 9520dd00070b..6d1a44ab5eee 100644 --- a/net/netfilter/xt_devgroup.c +++ b/net/netfilter/xt_devgroup.c @@ -33,14 +33,10 @@ static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) return true; } -static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) +static int devgroup_mt_check_hooks(const struct xt_mtchk_param *par) { const struct xt_devgroup_info *info = par->matchinfo; - if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC | - XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST)) - return -EINVAL; - if (info->flags & XT_DEVGROUP_MATCH_SRC && par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | @@ -56,9 +52,21 @@ static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) return 0; } +static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) +{ + const struct xt_devgroup_info *info = par->matchinfo; + + if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC | + XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST)) + return -EINVAL; + + return 0; +} + static struct xt_match devgroup_mt_reg __read_mostly = { .name = "devgroup", .match = devgroup_mt, + .check_hooks = devgroup_mt_check_hooks, .checkentry = devgroup_mt_checkentry, .matchsize = sizeof(struct xt_devgroup_info), .family = NFPROTO_UNSPEC, diff --git a/net/netfilter/xt_ecn.c b/net/netfilter/xt_ecn.c index b96e8203ac54..a8503f5d26bf 100644 --- a/net/netfilter/xt_ecn.c +++ b/net/netfilter/xt_ecn.c @@ -30,6 +30,10 @@ static bool match_tcp(const struct sk_buff *skb, struct xt_action_param *par) struct tcphdr _tcph; const struct tcphdr *th; + /* this is fine for IPv6 as ecn_mt_check6() enforces -p tcp */ + if (par->fragoff) + return false; + /* In practice, TCP match does this, so can't fail. But let's * be good citizens. */ diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 3bd127bfc114..2704b4b60d1e 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -658,6 +658,8 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, if (!(hinfo->cfg.mode & (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT))) return 0; + if (ntohs(ip_hdr(skb)->frag_off) & IP_OFFSET) + return -1; nexthdr = ip_hdr(skb)->protocol; break; #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) @@ -681,7 +683,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, return 0; nexthdr = ipv6_hdr(skb)->nexthdr; protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); - if ((int)protoff < 0) + if ((int)protoff < 0 || ntohs(frag_off) & IP6_OFFSET) return -1; break; } diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index dc9485854002..e8807caede68 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -27,6 +27,9 @@ static bool xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) { + if (p->fragoff) + return false; + return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p), xt_out(p), p->matchinfo, xt_net(p), nf_osf_fingers); } diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index d2b0b52434fa..dd98f758176c 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -91,14 +91,10 @@ match_outdev: return (!!ret ^ !(info->invert & XT_PHYSDEV_OP_OUT)); } -static int physdev_mt_check(const struct xt_mtchk_param *par) +static int physdev_mt_check_hooks(const struct xt_mtchk_param *par) { const struct xt_physdev_info *info = par->matchinfo; - static bool brnf_probed __read_mostly; - if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || - info->bitmask & ~XT_PHYSDEV_OP_MASK) - return -EINVAL; if (info->bitmask & (XT_PHYSDEV_OP_OUT | XT_PHYSDEV_OP_ISOUT) && (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) || info->invert & XT_PHYSDEV_OP_BRIDGED) && @@ -107,6 +103,18 @@ static int physdev_mt_check(const struct xt_mtchk_param *par) return -EINVAL; } + return 0; +} + +static int physdev_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_physdev_info *info = par->matchinfo; + static bool brnf_probed __read_mostly; + + if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || + info->bitmask & ~XT_PHYSDEV_OP_MASK) + return -EINVAL; + #define X(memb) strnlen(info->memb, sizeof(info->memb)) >= sizeof(info->memb) if (info->bitmask & XT_PHYSDEV_OP_IN) { if (info->physindev[0] == '\0') @@ -141,6 +149,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = { { .name = "physdev", .family = NFPROTO_IPV4, + .check_hooks = physdev_mt_check_hooks, .checkentry = physdev_mt_check, .match = physdev_mt, .matchsize = sizeof(struct xt_physdev_info), @@ -149,6 +158,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = { { .name = "physdev", .family = NFPROTO_IPV6, + .check_hooks = physdev_mt_check_hooks, .checkentry = physdev_mt_check, .match = physdev_mt, .matchsize = sizeof(struct xt_physdev_info), diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index b5fa65558318..ff54e3a8581e 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -126,13 +126,10 @@ policy_mt(const struct sk_buff *skb, struct xt_action_param *par) return ret; } -static int policy_mt_check(const struct xt_mtchk_param *par) +static int policy_mt_check_hooks(const struct xt_mtchk_param *par) { const struct xt_policy_info *info = par->matchinfo; - const char *errmsg = "neither incoming nor outgoing policy selected"; - - if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) - goto err; + const char *errmsg; if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) { @@ -144,6 +141,21 @@ static int policy_mt_check(const struct xt_mtchk_param *par) errmsg = "input policy not valid in POSTROUTING and OUTPUT"; goto err; } + + return 0; +err: + pr_info_ratelimited("%s\n", errmsg); + return -EINVAL; +} + +static int policy_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_policy_info *info = par->matchinfo; + const char *errmsg = "neither incoming nor outgoing policy selected"; + + if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) + goto err; + if (info->len > XT_POLICY_MAX_ELEM) { errmsg = "too many policy elements"; goto err; @@ -158,6 +170,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = { { .name = "policy", .family = NFPROTO_IPV4, + .check_hooks = policy_mt_check_hooks, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), @@ -166,6 +179,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = { { .name = "policy", .family = NFPROTO_IPV6, + .check_hooks = policy_mt_check_hooks, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 731bc2cafae4..4ae04bba9358 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -431,6 +431,29 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) } static int +set_target_v3_check_hooks(const struct xt_tgchk_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + + if (info->map_set.index != IPSET_INVALID_ID) { + if (strncmp(par->table, "mangle", 7)) { + pr_info_ratelimited("--map-set only usable from mangle table\n"); + return -EINVAL; + } + if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | + (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && + (par->hook_mask & ~(1 << NF_INET_FORWARD | + 1 << NF_INET_LOCAL_OUT | + 1 << NF_INET_POST_ROUTING))) { + pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); + return -EINVAL; + } + } + + return 0; +} + +static int set_target_v3_checkentry(const struct xt_tgchk_param *par) { const struct xt_set_info_target_v3 *info = par->targinfo; @@ -459,20 +482,6 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) } if (info->map_set.index != IPSET_INVALID_ID) { - if (strncmp(par->table, "mangle", 7)) { - pr_info_ratelimited("--map-set only usable from mangle table\n"); - ret = -EINVAL; - goto cleanup_del; - } - if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | - (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && - (par->hook_mask & ~(1 << NF_INET_FORWARD | - 1 << NF_INET_LOCAL_OUT | - 1 << NF_INET_POST_ROUTING))) { - pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); - ret = -EINVAL; - goto cleanup_del; - } index = ip_set_nfnl_get_byindex(par->net, info->map_set.index); if (index == IPSET_INVALID_ID) { @@ -672,6 +681,7 @@ static struct xt_target set_targets[] __read_mostly = { .family = NFPROTO_IPV4, .target = set_target_v3, .targetsize = sizeof(struct xt_set_info_target_v3), + .check_hooks = set_target_v3_check_hooks, .checkentry = set_target_v3_checkentry, .destroy = set_target_v3_destroy, .me = THIS_MODULE @@ -682,6 +692,7 @@ static struct xt_target set_targets[] __read_mostly = { .family = NFPROTO_IPV6, .target = set_target_v3, .targetsize = sizeof(struct xt_set_info_target_v3), + .check_hooks = set_target_v3_check_hooks, .checkentry = set_target_v3_checkentry, .destroy = set_target_v3_destroy, .me = THIS_MODULE diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c index 0d32d4841cb3..b9da8269161d 100644 --- a/net/netfilter/xt_tcpmss.c +++ b/net/netfilter/xt_tcpmss.c @@ -32,6 +32,10 @@ tcpmss_mt(const struct sk_buff *skb, struct xt_action_param *par) u8 _opt[15 * 4 - sizeof(_tcph)]; unsigned int i, optlen; + /* this is fine for IPv6 as xt_tcpmss enforces -p tcp */ + if (par->fragoff) + return false; + /* If we don't have the whole header, drop packet. */ th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); if (th == NULL) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 2aeb0680807d..7269e23b578d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1482,9 +1482,14 @@ static void do_one_broadcast(struct sock *sk, p->skb2 = NULL; goto out; } - NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); - if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED) - NETLINK_CB(p->skb2).nsid_is_set = true; + + NETLINK_CB(p->skb2).nsid_is_set = false; + if (!net_eq(sock_net(sk), p->net)) { + NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); + if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED) + NETLINK_CB(p->skb2).nsid_is_set = true; + } + val = netlink_broadcast_deliver(sk, p->skb2); if (val < 0) { netlink_overrun(sk); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index d251d894afd4..0da39eaed255 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1972,8 +1972,10 @@ int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group) { - if (WARN_ON_ONCE(group >= family->n_mcgrps)) + if (WARN_ON_ONCE(group >= family->n_mcgrps)) { + kfree_skb(skb); return -EINVAL; + } group = family->mcgrp_offset + group; return genlmsg_mcast(skb, portid, group); @@ -1986,8 +1988,10 @@ void genl_notify(const struct genl_family *family, struct sk_buff *skb, struct net *net = genl_info_net(info); struct sock *sk = net->genl_sock; - if (WARN_ON_ONCE(group >= family->n_mcgrps)) + if (WARN_ON_ONCE(group >= family->n_mcgrps)) { + kfree_skb(skb); return; + } group = family->mcgrp_offset + group; nlmsg_notify(sk, skb, info->snd_portid, group, diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 0d33c81a15fe..ba6f0310ffd7 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -861,6 +861,11 @@ static void nfc_hci_recv_from_llc(struct nfc_hci_dev *hdev, struct sk_buff *skb) struct sk_buff *frag_skb; int msg_len; + if (!pskb_may_pull(skb, NFC_HCI_HCP_PACKET_HEADER_LEN)) { + kfree_skb(skb); + return; + } + packet = (struct hcp_packet *)skb->data; if ((packet->header & ~NFC_HCI_FRAGMENT) == 0) { skb_queue_tail(&hdev->rx_hcp_frags, skb); @@ -904,6 +909,11 @@ static void nfc_hci_recv_from_llc(struct nfc_hci_dev *hdev, struct sk_buff *skb) * unblock waiting cmd context. Otherwise, enqueue to dispatch * in separate context where handler can also execute command. */ + if (!pskb_may_pull(hcp_skb, NFC_HCI_HCP_HEADER_LEN)) { + kfree_skb(hcp_skb); + return; + } + packet = (struct hcp_packet *)hcp_skb->data; type = HCP_MSG_GET_TYPE(packet->message.header); if (type == NFC_HCI_HCP_RESPONSE) { diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index db5bc6a878dd..dc65c719f35f 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -1218,6 +1218,15 @@ static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, sk = &llcp_sock->sk; + lock_sock(sk); + + /* Check if socket was destroyed whilst waiting for the lock */ + if (!sk_hashed(sk)) { + release_sock(sk); + nfc_llcp_sock_put(llcp_sock); + return; + } + /* Unlink from connecting and link to the client array */ nfc_llcp_sock_unlink(&local->connecting_sockets, sk); nfc_llcp_sock_link(&local->sockets, sk); @@ -1229,6 +1238,8 @@ static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, sk->sk_state = LLCP_CONNECTED; sk->sk_state_change(sk); + release_sock(sk); + nfc_llcp_sock_put(llcp_sock); } diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index f1be1e84f665..feab29fc62f4 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -633,6 +633,8 @@ static int llcp_sock_release(struct socket *sock) if (sock->type == SOCK_RAW) nfc_llcp_sock_unlink(&local->raw_sockets, sk); + else if (sk->sk_state == LLCP_CONNECTING) + nfc_llcp_sock_unlink(&local->connecting_sockets, sk); else nfc_llcp_sock_unlink(&local->sockets, sk); diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index 40ae8e5a7ec7..c03e8a0bd3bd 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -439,6 +439,11 @@ void nci_hci_data_received_cb(void *context, return; } + if (!pskb_may_pull(skb, NCI_HCI_HCP_PACKET_HEADER_LEN)) { + kfree_skb(skb); + return; + } + packet = (struct nci_hcp_packet *)skb->data; if ((packet->header & ~NCI_HCI_FRAGMENT) == 0) { skb_queue_tail(&ndev->hci_dev->rx_hcp_frags, skb); @@ -482,6 +487,11 @@ void nci_hci_data_received_cb(void *context, * unblock waiting cmd context. Otherwise, enqueue to dispatch * in separate context where handler can also execute command. */ + if (!pskb_may_pull(hcp_skb, NCI_HCI_HCP_HEADER_LEN)) { + kfree_skb(hcp_skb); + return; + } + packet = (struct nci_hcp_packet *)hcp_skb->data; type = NCI_HCP_MSG_GET_TYPE(packet->message.header); if (type == NCI_HCI_HCP_RESPONSE) { diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index b10e1602c6b1..cb5ea4424ffc 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -97,6 +97,9 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) goto error; } + vport->dev = dev; + netdev_hold(vport->dev, &vport->dev_tracker, GFP_KERNEL); + rtnl_unlock(); return vport; error: @@ -111,7 +114,7 @@ static struct vport *geneve_create(const struct vport_parms *parms) if (IS_ERR(vport)) return vport; - return ovs_netdev_link(vport, parms->name); + return ovs_netdev_link(vport, true); } static struct vport_ops ovs_geneve_vport_ops = { diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 4014c9b5eb79..6cb5a697b396 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -63,6 +63,9 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms) return ERR_PTR(err); } + vport->dev = dev; + netdev_hold(vport->dev, &vport->dev_tracker, GFP_KERNEL); + rtnl_unlock(); return vport; } @@ -75,7 +78,7 @@ static struct vport *gre_create(const struct vport_parms *parms) if (IS_ERR(vport)) return vport; - return ovs_netdev_link(vport, parms->name); + return ovs_netdev_link(vport, true); } static struct vport_ops ovs_gre_vport_ops = { diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 12055af832dc..e7e8490a53d8 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -73,37 +73,29 @@ static struct net_device *get_dpdev(const struct datapath *dp) return local->dev; } -struct vport *ovs_netdev_link(struct vport *vport, const char *name) +struct vport *ovs_netdev_link(struct vport *vport, bool tunnel) { int err; - vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); - if (!vport->dev) { + if (WARN_ON_ONCE(!vport->dev)) { err = -ENODEV; goto error_free_vport; } - /* Ensure that the device exists and that the provided - * name is not one of its aliases. + + rtnl_lock(); + /* Do not link devices that are not registered to avoid a potential + * race with the NETDEV_UNREGISTER notification in dp_device_event(). */ - if (strcmp(name, ovs_vport_name(vport))) { + if (vport->dev->reg_state != NETREG_REGISTERED) { err = -ENODEV; - goto error_put; - } - netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL); - if (vport->dev->flags & IFF_LOOPBACK || - (vport->dev->type != ARPHRD_ETHER && - vport->dev->type != ARPHRD_NONE) || - ovs_is_internal_dev(vport->dev)) { - err = -EINVAL; - goto error_put; + goto error_put_unlock; } - rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, get_dpdev(vport->dp), NULL, NULL, NULL); if (err) - goto error_unlock; + goto error_put_unlock; err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, vport); @@ -119,10 +111,11 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) error_master_upper_dev_unlink: netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); -error_unlock: - rtnl_unlock(); -error_put: +error_put_unlock: + if (tunnel && vport->dev->reg_state == NETREG_REGISTERED) + rtnl_delete_link(vport->dev, 0, NULL); netdev_put(vport->dev, &vport->dev_tracker); + rtnl_unlock(); error_free_vport: ovs_vport_free(vport); return ERR_PTR(err); @@ -132,12 +125,39 @@ EXPORT_SYMBOL_GPL(ovs_netdev_link); static struct vport *netdev_create(const struct vport_parms *parms) { struct vport *vport; + int err; vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); if (IS_ERR(vport)) return vport; - return ovs_netdev_link(vport, parms->name); + vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name); + if (!vport->dev) { + err = -ENODEV; + goto error_free_vport; + } + netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL); + + /* Ensure that the provided name is not an alias. */ + if (strcmp(parms->name, ovs_vport_name(vport))) { + err = -ENODEV; + goto error_put; + } + + if (vport->dev->flags & IFF_LOOPBACK || + (vport->dev->type != ARPHRD_ETHER && + vport->dev->type != ARPHRD_NONE) || + ovs_is_internal_dev(vport->dev)) { + err = -EINVAL; + goto error_put; + } + + return ovs_netdev_link(vport, false); +error_put: + netdev_put(vport->dev, &vport->dev_tracker); +error_free_vport: + ovs_vport_free(vport); + return ERR_PTR(err); } static void vport_netdev_free(struct rcu_head *rcu) @@ -196,9 +216,13 @@ void ovs_netdev_tunnel_destroy(struct vport *vport) */ if (vport->dev->reg_state == NETREG_REGISTERED) rtnl_delete_link(vport->dev, 0, NULL); - rtnl_unlock(); + /* We can't put the device reference yet, since it can still be in + * use, but rtnl_unlock()->netdev_run_todo() will block until all + * the references are released, so the RCU call must be before it. + */ call_rcu(&vport->rcu, vport_netdev_free); + rtnl_unlock(); } EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index c5d83a43bfc4..6c0d7366f986 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -13,7 +13,7 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); -struct vport *ovs_netdev_link(struct vport *vport, const char *name); +struct vport *ovs_netdev_link(struct vport *vport, bool tunnel); void ovs_netdev_detach_dev(struct vport *); int __init ovs_netdev_init(void); diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 0b881b043bcf..c1b37b50d29e 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -126,6 +126,9 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) goto error; } + vport->dev = dev; + netdev_hold(vport->dev, &vport->dev_tracker, GFP_KERNEL); + rtnl_unlock(); return vport; error: @@ -140,7 +143,7 @@ static struct vport *vxlan_create(const struct vport_parms *parms) if (IS_ERR(vport)) return vport; - return ovs_netdev_link(vport, parms->name); + return ovs_netdev_link(vport, true); } static struct vport_ops ovs_vxlan_netdev_vport_ops = { diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 4dbf0914df7d..706927139393 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -671,8 +671,23 @@ static int pep_do_rcv(struct sock *sk, struct sk_buff *skb) /* Look for an existing pipe handle */ sknode = pep_find_pipe(&pn->hlist, &dst, pipe_handle); - if (sknode) - return sk_receive_skb(sknode, skb, 1); + if (sknode) { + int rc; + + /* pep_do_rcv() runs from two contexts: from softirq via + * phonet_rcv() -> __sk_receive_skb() with BH disabled, + * and from process context via + * release_sock() -> __release_sock(), which drops + * the listener slock with spin_unlock_bh() before draining + * the backlog. The child pipe slock is taken below via + * bh_lock_sock_nested(), which does not itself disable BH, so + * disable BH here to keep both acquire contexts consistent. + */ + local_bh_disable(); + rc = sk_receive_skb(sknode, skb, 1); + local_bh_enable(); + return rc; + } switch (hdr->message_id) { case PNS_PEP_CONNECT_REQ: diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index 9508b6c38003..e45549f08eef 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -263,15 +263,16 @@ EXPORT_SYMBOL(psp_dev_encapsulate); /* Receive handler for PSP packets. * - * Presently it accepts only already-authenticated packets and does not - * support optional fields, such as virtualization cookies. The caller should - * ensure that skb->data is pointing to the mac header, and that skb->mac_len - * is set. This function does not currently adjust skb->csum (CHECKSUM_COMPLETE - * is not supported). + * Accepts only already-authenticated packets. The full PSP header is + * stripped according to psph->hdrlen; any optional fields it advertises + * (virtualization cookies, etc.) are ignored and discarded along with the + * rest of the header. The caller should ensure that skb->data is pointing + * to the mac header, and that skb->mac_len is set. This function does not + * currently adjust skb->csum (CHECKSUM_COMPLETE is not supported). */ int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv) { - int l2_hlen = 0, l3_hlen, encap; + int l2_hlen = 0, l3_hlen, encap, psp_hlen; struct psp_skb_ext *pse; struct psphdr *psph; struct ethhdr *eth; @@ -312,18 +313,36 @@ int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv) if (unlikely(uh->dest != htons(PSP_DEFAULT_UDP_PORT))) return -EINVAL; - pse = skb_ext_add(skb, SKB_EXT_PSP); - if (!pse) + psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen + + sizeof(struct udphdr)); + + /* Strip the full PSP header per psph->hdrlen; VC/options are pulled + * into the linear region only so they can be discarded with the + * rest of the header. + */ + psp_hlen = (psph->hdrlen + 1) * 8; + + if (unlikely(psp_hlen < sizeof(struct psphdr))) + return -EINVAL; + + if (psp_hlen > sizeof(struct psphdr) && + !pskb_may_pull(skb, l2_hlen + l3_hlen + + sizeof(struct udphdr) + psp_hlen)) return -EINVAL; psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen + sizeof(struct udphdr)); + + pse = skb_ext_add(skb, SKB_EXT_PSP); + if (!pse) + return -EINVAL; + pse->spi = psph->spi; pse->dev_id = dev_id; pse->generation = generation; pse->version = FIELD_GET(PSPHDR_VERFL_VERSION, psph->verfl); - encap = PSP_ENCAP_HLEN; + encap = sizeof(struct udphdr) + psp_hlen; encap += strip_icv ? PSP_TRL_SIZE : 0; if (proto == htons(ETH_P_IP)) { @@ -340,8 +359,9 @@ int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv) ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) - encap); } - memmove(skb->data + PSP_ENCAP_HLEN, skb->data, l2_hlen + l3_hlen); - skb_pull(skb, PSP_ENCAP_HLEN); + memmove(skb->data + sizeof(struct udphdr) + psp_hlen, + skb->data, l2_hlen + l3_hlen); + skb_pull(skb, sizeof(struct udphdr) + psp_hlen); if (strip_icv) pskb_trim(skb, skb->len - PSP_TRL_SIZE); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 0c64c504f79d..4001de0c4959 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -656,6 +656,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) sends_out: vfree(ic->i_sends); + ic->i_sends = NULL; ack_dma_out: rds_dma_hdr_free(rds_ibdev->dev, ic->i_ack, ic->i_ack_dma, diff --git a/net/rds/message.c b/net/rds/message.c index eaa6f22601a4..7feb0eb6537d 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -131,24 +131,34 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs, */ static void rds_message_purge(struct rds_message *rm) { + struct rds_znotifier *znotifier; unsigned long i, flags; - bool zcopy = false; + bool zcopy; if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; spin_lock_irqsave(&rm->m_rs_lock, flags); + znotifier = rm->data.op_mmp_znotifier; + rm->data.op_mmp_znotifier = NULL; + zcopy = !!znotifier; + if (rm->m_rs) { struct rds_sock *rs = rm->m_rs; - if (rm->data.op_mmp_znotifier) { - zcopy = true; - rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier); + if (znotifier) { + rds_rm_zerocopy_callback(rs, znotifier); rds_wake_sk_sleep(rs); - rm->data.op_mmp_znotifier = NULL; } sock_put(rds_rs_to_sk(rs)); rm->m_rs = NULL; + } else if (znotifier) { + /* + * Zerocopy can fail before the message is queued on the + * socket, so there is no rs to carry the notification. + */ + mm_unaccount_pinned_pages(&znotifier->z_mmp); + kfree(rds_info_from_znotifier(znotifier)); } spin_unlock_irqrestore(&rm->m_rs_lock, flags); @@ -438,6 +448,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter * for (i = 0; i < rm->data.op_nents; i++) put_page(sg_page(&rm->data.op_sg[i])); + rm->data.op_nents = 0; mmp = &rm->data.op_mmp_znotifier->z_mmp; mm_unaccount_pinned_pages(mmp); ret = -EFAULT; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 654e23d13e3d..5830b31a1f37 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -198,8 +198,13 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc); write_lock_bh(&sock->sk->sk_callback_lock); - /* done under the callback_lock to serialize with write_space */ + /* done under the callback_lock to serialize with write_space. + * Set t_sock inside rds_tcp_tc_list_lock so readers walking + * rds_tcp_tc_list under the same lock cannot observe an + * entry whose t_sock is NULL. + */ spin_lock(&rds_tcp_tc_list_lock); + tc->t_sock = sock; list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); #if IS_ENABLED(CONFIG_IPV6) rds6_tcp_tc_count++; @@ -211,8 +216,6 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) /* accepted sockets need our listen data ready undone */ if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready) sock->sk->sk_data_ready = sock->sk->sk_user_data; - - tc->t_sock = sock; if (!tc->t_rtn) tc->t_rtn = net_generic(sock_net(sock->sk), rds_tcp_netid); tc->t_cpath = cp; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 27c2aa2dd023..98f2165159d7 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -213,8 +213,6 @@ struct rxrpc_skb_priv { struct { u16 offset; /* Offset of data */ u16 len; /* Length of data */ - u8 flags; -#define RXRPC_RX_VERIFIED 0x01 }; struct { rxrpc_seq_t first_ack; /* First packet in acks table */ @@ -309,15 +307,16 @@ struct rxrpc_security { struct sk_buff *challenge); /* verify a response */ - int (*verify_response)(struct rxrpc_connection *, - struct sk_buff *); + int (*verify_response)(struct rxrpc_connection *conn, + struct sk_buff *response_skb, + void *response, unsigned int len); /* clear connection security */ void (*clear)(struct rxrpc_connection *); /* Default ticket -> key decoder */ int (*default_decode_ticket)(struct rxrpc_connection *conn, struct sk_buff *skb, - unsigned int ticket_offset, unsigned int ticket_len, + void *ticket, unsigned int ticket_len, struct key **_key); }; @@ -774,6 +773,11 @@ struct rxrpc_call { struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */ struct sk_buff_head rx_queue; /* Queue of packets for this call to receive */ struct sk_buff_head rx_oos_queue; /* Queue of out of sequence packets */ + void *rx_dec_buffer; /* Decryption buffer */ + unsigned short rx_dec_bsize; /* rx_dec_buffer size */ + unsigned short rx_dec_offset; /* Decrypted packet data offset */ + unsigned short rx_dec_len; /* Decrypted packet data len */ + rxrpc_seq_t rx_dec_seq; /* Packet in decryption buffer */ rxrpc_seq_t rx_highest_seq; /* Higest sequence number received */ rxrpc_seq_t rx_consumed; /* Highest packet consumed */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index fdd683261226..fec59d9338b9 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -332,25 +332,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call) saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK; - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && - sp->hdr.securityIndex != 0 && - skb_cloned(skb)) { - /* Unshare the packet so that it can be - * modified by in-place decryption. - */ - struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); - - if (nskb) { - rxrpc_new_skb(nskb, rxrpc_skb_new_unshared); - rxrpc_input_call_packet(call, nskb); - rxrpc_free_skb(nskb, rxrpc_skb_put_call_rx); - } else { - /* OOM - Drop the packet. */ - rxrpc_see_skb(skb, rxrpc_skb_see_unshare_nomem); - } - } else { - rxrpc_input_call_packet(call, skb); - } + rxrpc_input_call_packet(call, skb); rxrpc_free_skb(skb, rxrpc_skb_put_call_rx); did_receive = true; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f035f486c139..fcb9d38bb521 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -152,6 +152,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, spin_lock_init(&call->notify_lock); refcount_set(&call->ref, 1); call->debug_id = debug_id; + call->rx_pkt_offset = USHRT_MAX; call->tx_total_len = -1; call->tx_jumbo_max = 1; call->next_rx_timo = 20 * HZ; @@ -553,6 +554,7 @@ static void rxrpc_cleanup_rx_buffers(struct rxrpc_call *call) rxrpc_purge_queue(&call->recvmsg_queue); rxrpc_purge_queue(&call->rx_queue); rxrpc_purge_queue(&call->rx_oos_queue); + kfree(call->rx_dec_buffer); } /* diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index a2130d25aaa9..c96ca615b787 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -243,27 +243,22 @@ static void rxrpc_call_is_secure(struct rxrpc_call *call) static int rxrpc_verify_response(struct rxrpc_connection *conn, struct sk_buff *skb) { + unsigned int len = skb->len - sizeof(struct rxrpc_wire_header); + void *buffer; int ret; - if (skb_cloned(skb)) { - /* Copy the packet if shared so that we can do in-place - * decryption. - */ - struct sk_buff *nskb = skb_copy(skb, GFP_NOFS); + buffer = kmalloc(len, GFP_NOFS); + if (!buffer) + return -ENOMEM; - if (nskb) { - rxrpc_new_skb(nskb, rxrpc_skb_new_unshared); - ret = conn->security->verify_response(conn, nskb); - rxrpc_free_skb(nskb, rxrpc_skb_put_response_copy); - } else { - /* OOM - Drop the packet. */ - rxrpc_see_skb(skb, rxrpc_skb_see_unshare_nomem); - ret = -ENOMEM; - } - } else { - ret = conn->security->verify_response(conn, skb); - } + ret = skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), buffer, len); + if (ret < 0) + goto out; + + ret = conn->security->verify_response(conn, skb, buffer, len); +out: + kfree(buffer); return ret; } diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index 0a260df45d25..0b39046bdc61 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -32,9 +32,6 @@ static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - sp->flags |= RXRPC_RX_VERIFIED; return 0; } @@ -57,9 +54,10 @@ static int none_sendmsg_respond_to_challenge(struct sk_buff *challenge, } static int none_verify_response(struct rxrpc_connection *conn, - struct sk_buff *skb) + struct sk_buff *response_skb, + void *response, unsigned int len) { - return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + return rxrpc_abort_conn(conn, response_skb, RX_PROTOCOL_ERROR, -EPROTO, rxrpc_eproto_rxnull_response); } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index e1f7513a46db..c940600117a4 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -147,15 +147,52 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) } /* - * Decrypt and verify a DATA packet. + * Decrypt and verify a DATA packet. The content of the packet is pulled out + * into a flat buffer rather than decrypting in place in the skbuff. This also + * has the advantage of aligning the buffer correctly for the crypto routines. + * + * We keep track of the sequence number of the packet currently decrypted into + * the buffer in ->rx_dec_seq. If MSG_PEEK is used and steps onto a new + * packet, subsequent recvmsg() calls will have to go back and re-decrypt the + * current packet. */ static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + int ret; - if (sp->flags & RXRPC_RX_VERIFIED) - return 0; - return call->security->verify_packet(call, skb); + if (sp->len > call->rx_dec_bsize) { + /* Make sure we can hold a 1412-byte jumbo subpacket and make + * sure that the buffer size is aligned to a crypto blocksize. + */ + size_t size = clamp(round_up(sp->len, 32), 2048, 65535); + void *buffer = krealloc(call->rx_dec_buffer, size, GFP_NOFS); + + if (!buffer) + return -ENOMEM; + call->rx_dec_buffer = buffer; + call->rx_dec_bsize = size; + } + + ret = -EFAULT; + if (skb_copy_bits(skb, sp->offset, call->rx_dec_buffer, sp->len) < 0) + goto err; + + call->rx_dec_offset = 0; + call->rx_dec_len = sp->len; + call->rx_dec_seq = sp->hdr.seq; + ret = call->security->verify_packet(call, skb); + if (ret < 0) + goto err; + return 0; + +err: + kfree(call->rx_dec_buffer); + call->rx_dec_buffer = NULL; + call->rx_dec_bsize = 0; + call->rx_dec_offset = 0; + call->rx_dec_len = 0; + return ret; } /* @@ -283,16 +320,21 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (msg) sock_recv_timestamp(msg, sock->sk, skb); - if (rx_pkt_offset == 0) { + if (call->rx_dec_seq != sp->hdr.seq || + !call->rx_dec_buffer) { ret2 = rxrpc_verify_data(call, skb); trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq, - sp->offset, sp->len, ret2); + call->rx_dec_offset, + call->rx_dec_len, ret2); if (ret2 < 0) { ret = ret2; goto out; } - rx_pkt_offset = sp->offset; - rx_pkt_len = sp->len; + } + + if (rx_pkt_offset == USHRT_MAX) { + rx_pkt_offset = call->rx_dec_offset; + rx_pkt_len = call->rx_dec_len; } else { trace_rxrpc_recvdata(call, rxrpc_recvmsg_cont, seq, rx_pkt_offset, rx_pkt_len, 0); @@ -304,10 +346,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (copy > remain) copy = remain; if (copy > 0) { - ret2 = skb_copy_datagram_iter(skb, rx_pkt_offset, iter, - copy); - if (ret2 < 0) { - ret = ret2; + ret2 = copy_to_iter(call->rx_dec_buffer + rx_pkt_offset, + copy, iter); + if (ret2 != copy) { + ret = -EFAULT; goto out; } @@ -328,7 +370,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, /* The whole packet has been transferred. */ if (sp->hdr.flags & RXRPC_LAST_PACKET) ret = 1; - rx_pkt_offset = 0; + rx_pkt_offset = USHRT_MAX; rx_pkt_len = 0; skb = skb_peek_next(skb, &call->recvmsg_queue); diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 0d5e654da918..a1ee102abae1 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -473,15 +473,20 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxgk_header *hdr; struct krb5_buffer metadata; - unsigned int offset = sp->offset, len = sp->len; + unsigned int len = call->rx_dec_len; size_t data_offset = 0, data_len = len; + void *data = call->rx_dec_buffer, *p = data; u32 ac = 0; int ret = -ENOMEM; _enter(""); - crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE, - &data_offset, &data_len); + if (crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE, + &data_offset, &data_len) < 0) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_1_short_header); + goto put_gk; + } hdr = kzalloc_obj(*hdr, GFP_NOFS); if (!hdr) @@ -496,16 +501,15 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, metadata.len = sizeof(*hdr); metadata.data = hdr; - ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata, - skb, &offset, &len, &ac); + ret = rxgk_verify_mic(gk->krb5, gk->rx_Kc, &metadata, &p, &len, &ac); kfree(hdr); if (ret < 0) { if (ret != -ENOMEM) rxrpc_abort_eproto(call, skb, ac, rxgk_abort_1_verify_mic_eproto); } else { - sp->offset = offset; - sp->len = len; + call->rx_dec_offset = p - data; + call->rx_dec_len = len; } put_gk: @@ -522,49 +526,53 @@ static int rxgk_verify_packet_encrypted(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxgk_header hdr; - unsigned int offset = sp->offset, len = sp->len; + struct rxgk_header *hdr; + unsigned int offset = 0, len = call->rx_dec_len; + void *data = call->rx_dec_buffer, *p = data; int ret; u32 ac = 0; _enter(""); - ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac); + if (crypto_krb5_check_data_len(gk->krb5, KRB5_ENCRYPT_MODE, + len, sizeof(*hdr)) < 0) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_2_short_header); + goto error; + } + + ret = rxgk_decrypt(gk->krb5, gk->rx_enc, &p, &len, &ac); if (ret < 0) { if (ret != -ENOMEM) rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); goto error; } + offset = p - data; - if (len < sizeof(hdr)) { + if (len < sizeof(*hdr)) { ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, rxgk_abort_2_short_header); goto error; } /* Extract the header from the skb */ - ret = skb_copy_bits(skb, offset, &hdr, sizeof(hdr)); - if (ret < 0) { - ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, - rxgk_abort_2_short_encdata); - goto error; - } - offset += sizeof(hdr); - len -= sizeof(hdr); - - if (ntohl(hdr.epoch) != call->conn->proto.epoch || - ntohl(hdr.cid) != call->cid || - ntohl(hdr.call_number) != call->call_id || - ntohl(hdr.seq) != sp->hdr.seq || - ntohl(hdr.sec_index) != call->security_ix || - ntohl(hdr.data_len) > len) { + hdr = data + offset; + offset += sizeof(*hdr); + len -= sizeof(*hdr); + + if (ntohl(hdr->epoch) != call->conn->proto.epoch || + ntohl(hdr->cid) != call->cid || + ntohl(hdr->call_number) != call->call_id || + ntohl(hdr->seq) != sp->hdr.seq || + ntohl(hdr->sec_index) != call->security_ix || + ntohl(hdr->data_len) > len) { ret = rxrpc_abort_eproto(call, skb, RXGK_SEALEDINCON, rxgk_abort_2_short_data); goto error; } - sp->offset = offset; - sp->len = ntohl(hdr.data_len); + call->rx_dec_offset = offset; + call->rx_dec_len = ntohl(hdr->data_len); ret = 0; error: rxgk_put(gk); @@ -1076,11 +1084,12 @@ static int rxgk_sendmsg_respond_to_challenge(struct sk_buff *challenge, * unsigned int call_numbers<>; * }; */ -static int rxgk_do_verify_authenticator(struct rxrpc_connection *conn, - const struct krb5_enctype *krb5, - struct sk_buff *skb, - __be32 *p, __be32 *end) +static int rxgk_verify_authenticator(struct rxrpc_connection *conn, + const struct krb5_enctype *krb5, + struct sk_buff *skb, + void *auth, unsigned int auth_len) { + __be32 *p = auth, *end = auth + auth_len; u32 app_len, call_count, level, epoch, cid, i; _enter(""); @@ -1144,37 +1153,6 @@ static int rxgk_do_verify_authenticator(struct rxrpc_connection *conn, } /* - * Extract the authenticator and verify it. - */ -static int rxgk_verify_authenticator(struct rxrpc_connection *conn, - const struct krb5_enctype *krb5, - struct sk_buff *skb, - unsigned int auth_offset, unsigned int auth_len) -{ - void *auth; - __be32 *p; - int ret; - - auth = kmalloc(auth_len, GFP_NOFS); - if (!auth) - return -ENOMEM; - - ret = skb_copy_bits(skb, auth_offset, auth, auth_len); - if (ret < 0) { - ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, - rxgk_abort_resp_short_auth); - goto error; - } - - p = auth; - ret = rxgk_do_verify_authenticator(conn, krb5, skb, p, - p + auth_len / sizeof(*p)); -error: - kfree(auth); - return ret; -} - -/* * Verify a response. * * struct RXGK_Response { @@ -1184,49 +1162,45 @@ error: * }; */ static int rxgk_verify_response(struct rxrpc_connection *conn, - struct sk_buff *skb) + struct sk_buff *skb, + void *buffer, unsigned int len) { const struct krb5_enctype *krb5; struct rxrpc_key_token *token; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxgk_response rhdr; + struct rxgk_response *rhdr; struct rxgk_context *gk; struct key *key = NULL; - unsigned int offset = sizeof(struct rxrpc_wire_header); - unsigned int len = skb->len - sizeof(struct rxrpc_wire_header); - unsigned int token_offset, token_len; - unsigned int auth_offset, auth_len; + unsigned int resp_token_len, auth_len; + void *resp_token, *auth; __be32 xauth_len; int ret, ec; _enter("{%d}", conn->debug_id); /* Parse the RXGK_Response object */ - if (sizeof(rhdr) + sizeof(__be32) > len) + if (len < sizeof(*rhdr) + sizeof(__be32)) goto short_packet; - - if (skb_copy_bits(skb, offset, &rhdr, sizeof(rhdr)) < 0) - goto short_packet; - offset += sizeof(rhdr); - len -= sizeof(rhdr); - - token_offset = offset; - token_len = ntohl(rhdr.token_len); - if (token_len > len || - xdr_round_up(token_len) + sizeof(__be32) > len) + rhdr = buffer; + buffer += sizeof(*rhdr); + len -= sizeof(*rhdr); + + resp_token = buffer; + resp_token_len = ntohl(rhdr->token_len); + if (resp_token_len > len || + xdr_round_up(resp_token_len) + sizeof(__be32) > len) goto short_packet; - trace_rxrpc_rx_response(conn, sp->hdr.serial, 0, sp->hdr.cksum, token_len); + trace_rxrpc_rx_response(conn, sp->hdr.serial, 0, sp->hdr.cksum, resp_token_len); - offset += xdr_round_up(token_len); - len -= xdr_round_up(token_len); + buffer += xdr_round_up(resp_token_len); + len -= xdr_round_up(resp_token_len); - if (skb_copy_bits(skb, offset, &xauth_len, sizeof(xauth_len)) < 0) - goto short_packet; - offset += sizeof(xauth_len); + xauth_len = *(__be32 *)buffer; + buffer += sizeof(xauth_len); len -= sizeof(xauth_len); - auth_offset = offset; + auth = buffer; auth_len = ntohl(xauth_len); if (auth_len > len) goto short_packet; @@ -1241,7 +1215,7 @@ static int rxgk_verify_response(struct rxrpc_connection *conn, * to the app to deal with - which might mean a round trip to * userspace. */ - ret = rxgk_extract_token(conn, skb, token_offset, token_len, &key); + ret = rxgk_extract_token(conn, skb, resp_token, resp_token_len, &key); if (ret < 0) goto out; @@ -1255,7 +1229,7 @@ static int rxgk_verify_response(struct rxrpc_connection *conn, */ token = key->payload.data[0]; conn->security_level = token->rxgk->level; - conn->rxgk.start_time = __be64_to_cpu(rhdr.start_time); + conn->rxgk.start_time = __be64_to_cpu(rhdr->start_time); gk = rxgk_generate_transport_key(conn, token->rxgk, sp->hdr.cksum, GFP_NOFS); if (IS_ERR(gk)) { @@ -1265,18 +1239,18 @@ static int rxgk_verify_response(struct rxrpc_connection *conn, krb5 = gk->krb5; - trace_rxrpc_rx_response(conn, sp->hdr.serial, krb5->etype, sp->hdr.cksum, token_len); + trace_rxrpc_rx_response(conn, sp->hdr.serial, krb5->etype, sp->hdr.cksum, + resp_token_len); /* Decrypt, parse and verify the authenticator. */ - ret = rxgk_decrypt_skb(krb5, gk->resp_enc, skb, - &auth_offset, &auth_len, &ec); + ret = rxgk_decrypt(krb5, gk->resp_enc, &auth, &auth_len, &ec); if (ret < 0) { rxrpc_abort_conn(conn, skb, RXGK_SEALEDINCON, ret, rxgk_abort_resp_auth_dec); goto out_gk; } - ret = rxgk_verify_authenticator(conn, krb5, skb, auth_offset, auth_len); + ret = rxgk_verify_authenticator(conn, krb5, skb, auth, auth_len); if (ret < 0) goto out_gk; diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c index 0ef2a29eb695..200a30064fae 100644 --- a/net/rxrpc/rxgk_app.c +++ b/net/rxrpc/rxgk_app.c @@ -40,7 +40,7 @@ * }; */ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, - unsigned int ticket_offset, unsigned int ticket_len, + void *buffer, unsigned int ticket_len, struct key **_key) { struct rxrpc_key_token *token; @@ -49,7 +49,7 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, size_t pre_ticket_len, payload_len; unsigned int klen, enctype; void *payload, *ticket; - __be32 *t, *p, *q, tmp[2]; + __be32 *t, *p, *q, *tmp; int ret; _enter(""); @@ -59,10 +59,7 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, rxgk_abort_resp_short_yfs_tkt); /* Get the session key length */ - ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp)); - if (ret < 0) - return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, - rxgk_abort_resp_short_yfs_klen); + tmp = buffer; enctype = ntohl(tmp[0]); klen = ntohl(tmp[1]); @@ -84,12 +81,7 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, * it. */ ticket = payload + pre_ticket_len; - ret = skb_copy_bits(skb, ticket_offset, ticket, ticket_len); - if (ret < 0) { - ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, - rxgk_abort_resp_short_yfs_tkt); - goto error; - } + memcpy(ticket, buffer, ticket_len); /* Fill out the form header. */ p = payload; @@ -131,7 +123,7 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, goto error; } - /* Ticket read in with skb_copy_bits above */ + /* Ticket appended above. */ q += xdr_round_up(ticket_len) / 4; if (WARN_ON((unsigned long)q - (unsigned long)payload != payload_len)) { ret = -EIO; @@ -182,14 +174,15 @@ error: * [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-afs-08 sec 6.1] */ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, - unsigned int token_offset, unsigned int token_len, + void *token, unsigned int token_len, struct key **_key) { const struct krb5_enctype *krb5; const struct krb5_buffer *server_secret; struct crypto_aead *token_enc = NULL; struct key *server_key; - unsigned int ticket_offset, ticket_len; + unsigned int ticket_len; + void *ticket; u32 kvno, enctype; int ret, ec = 0; @@ -197,24 +190,23 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, __be32 kvno; __be32 enctype; __be32 token_len; - } container; + } *container; - if (token_len < sizeof(container)) + if (token_len < sizeof(*container)) goto short_packet; /* Decode the RXGK_TokenContainer object. This tells us which server * key we should be using. We can then fetch the key, get the secret * and set up the crypto to extract the token. */ - if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0) - goto short_packet; + container = token; + token += sizeof(*container); - kvno = ntohl(container.kvno); - enctype = ntohl(container.enctype); - ticket_len = ntohl(container.token_len); - ticket_offset = token_offset + sizeof(container); + kvno = ntohl(container->kvno); + enctype = ntohl(container->enctype); + ticket_len = ntohl(container->token_len); - if (ticket_len > xdr_round_down(token_len - sizeof(container))) + if (ticket_len > xdr_round_down(token_len - sizeof(*container))) goto short_packet; _debug("KVNO %u", kvno); @@ -237,8 +229,8 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, * gain access to K0, from which we can derive the transport key and * thence decode the authenticator. */ - ret = rxgk_decrypt_skb(krb5, token_enc, skb, - &ticket_offset, &ticket_len, &ec); + ticket = token; + ret = rxgk_decrypt(krb5, token_enc, &ticket, &ticket_len, &ec); crypto_free_aead(token_enc); token_enc = NULL; if (ret < 0) { @@ -248,7 +240,7 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, return ret; } - ret = conn->security->default_decode_ticket(conn, skb, ticket_offset, + ret = conn->security->default_decode_ticket(conn, skb, ticket, ticket_len, _key); if (ret < 0) goto cant_get_token; diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h index 1e257d7ab8ec..3deed5863f5a 100644 --- a/net/rxrpc/rxgk_common.h +++ b/net/rxrpc/rxgk_common.h @@ -41,10 +41,10 @@ struct rxgk_context { * rxgk_app.c */ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, - unsigned int ticket_offset, unsigned int ticket_len, + void *ticket, unsigned int ticket_len, struct key **_key); int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, - unsigned int token_offset, unsigned int token_len, + void *token, unsigned int token_len, struct key **_key); /* @@ -62,31 +62,30 @@ int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key, gfp_t gfp); /* - * Apply decryption and checksumming functions to part of an skbuff. The - * offset and length are updated to reflect the actual content of the encrypted + * Apply decryption and checksumming functions a flat data buffer. The data + * point and length are updated to reflect the actual content of the encrypted * region. */ -static inline -int rxgk_decrypt_skb(const struct krb5_enctype *krb5, - struct crypto_aead *aead, - struct sk_buff *skb, - unsigned int *_offset, unsigned int *_len, - int *_error_code) +static inline int rxgk_decrypt(const struct krb5_enctype *krb5, + struct crypto_aead *aead, + void **_data, unsigned int *_len, + int *_error_code) { - struct scatterlist sg[16]; + struct scatterlist sg[1]; size_t offset = 0, len = *_len; - int nr_sg, ret; + int ret; - sg_init_table(sg, ARRAY_SIZE(sg)); - nr_sg = skb_to_sgvec(skb, sg, *_offset, len); - if (unlikely(nr_sg < 0)) - return nr_sg; + sg_init_one(sg, *_data, len); - ret = crypto_krb5_decrypt(krb5, aead, sg, nr_sg, - &offset, &len); + ret = crypto_krb5_decrypt(krb5, aead, sg, 1, &offset, &len); switch (ret) { case 0: - *_offset += offset; + if (offset & 3) { + *_error_code = RXGK_INCONSISTENCY; + ret = -EPROTO; + break; + } + *_data += offset; *_len = len; break; case -EBADMSG: /* Checksum mismatch. */ @@ -106,31 +105,26 @@ int rxgk_decrypt_skb(const struct krb5_enctype *krb5, } /* - * Check the MIC on a region of an skbuff. The offset and length are updated - * to reflect the actual content of the secure region. + * Check the MIC on a flat buffer. The data pointer and length are updated to + * reflect the actual content of the secure region. */ static inline -int rxgk_verify_mic_skb(const struct krb5_enctype *krb5, - struct crypto_shash *shash, - const struct krb5_buffer *metadata, - struct sk_buff *skb, - unsigned int *_offset, unsigned int *_len, - u32 *_error_code) +int rxgk_verify_mic(const struct krb5_enctype *krb5, + struct crypto_shash *shash, + const struct krb5_buffer *metadata, + void **_data, unsigned int *_len, + u32 *_error_code) { - struct scatterlist sg[16]; + struct scatterlist sg[1]; size_t offset = 0, len = *_len; - int nr_sg, ret; + int ret; - sg_init_table(sg, ARRAY_SIZE(sg)); - nr_sg = skb_to_sgvec(skb, sg, *_offset, len); - if (unlikely(nr_sg < 0)) - return nr_sg; + sg_init_one(sg, *_data, len); - ret = crypto_krb5_verify_mic(krb5, shash, metadata, sg, nr_sg, - &offset, &len); + ret = crypto_krb5_verify_mic(krb5, shash, metadata, sg, 1, &offset, &len); switch (ret) { case 0: - *_offset += offset; + *_data += offset; *_len = len; break; case -EBADMSG: /* Checksum mismatch */ diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index cba7935977f0..6fbd883401ac 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -430,27 +430,25 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_seq_t seq, struct skcipher_request *req) { - struct rxkad_level1_hdr sechdr; + struct rxkad_level1_hdr *sechdr; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt iv; - struct scatterlist sg[16]; - u32 data_size, buf; + struct scatterlist sg[1]; + void *data = call->rx_dec_buffer; + u32 len = sp->len, data_size, buf; u16 check; int ret; _enter(""); - if (sp->len < 8) + if (len < 8) return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, rxkad_abort_1_short_header); /* Decrypt the skbuff in-place. TODO: We really want to decrypt * directly into the target buffer. */ - sg_init_table(sg, ARRAY_SIZE(sg)); - ret = skb_to_sgvec(skb, sg, sp->offset, 8); - if (unlikely(ret < 0)) - return ret; + sg_init_one(sg, data, len); /* start the decryption afresh */ memset(&iv, 0, sizeof(iv)); @@ -464,13 +462,11 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, return ret; /* Extract the decrypted packet length */ - if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) - return rxrpc_abort_eproto(call, skb, RXKADDATALEN, - rxkad_abort_1_short_encdata); - sp->offset += sizeof(sechdr); - sp->len -= sizeof(sechdr); + sechdr = data; + call->rx_dec_offset = sizeof(*sechdr); + len -= sizeof(*sechdr); - buf = ntohl(sechdr.data_size); + buf = ntohl(sechdr->data_size); data_size = buf & 0xffff; check = buf >> 16; @@ -479,10 +475,10 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, if (check != 0) return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, rxkad_abort_1_short_check); - if (data_size > sp->len) + if (data_size > len) return rxrpc_abort_eproto(call, skb, RXKADDATALEN, rxkad_abort_1_short_data); - sp->len = data_size; + call->rx_dec_len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; @@ -496,43 +492,28 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, struct skcipher_request *req) { const struct rxrpc_key_token *token; - struct rxkad_level2_hdr sechdr; + struct rxkad_level2_hdr *sechdr; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt iv; - struct scatterlist _sg[4], *sg; - u32 data_size, buf; + struct scatterlist sg[1]; + void *data = call->rx_dec_buffer; + u32 len = sp->len, data_size, buf; u16 check; - int nsg, ret; + int ret; - _enter(",{%d}", sp->len); + _enter(",{%d}", len); - if (sp->len < 8) + if (len < 8) return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, rxkad_abort_2_short_header); /* Don't let the crypto algo see a misaligned length. */ - sp->len = round_down(sp->len, 8); + len = round_down(len, 8); - /* Decrypt the skbuff in-place. TODO: We really want to decrypt - * directly into the target buffer. + /* Decrypt in place in the call's decryption buffer. TODO: We really + * want to decrypt directly into the target buffer. */ - sg = _sg; - nsg = skb_shinfo(skb)->nr_frags + 1; - if (nsg <= 4) { - nsg = 4; - } else { - sg = kmalloc_objs(*sg, nsg, GFP_NOIO); - if (!sg) - return -ENOMEM; - } - - sg_init_table(sg, nsg); - ret = skb_to_sgvec(skb, sg, sp->offset, sp->len); - if (unlikely(ret < 0)) { - if (sg != _sg) - kfree(sg); - return ret; - } + sg_init_one(sg, data, len); /* decrypt from the session key */ token = call->conn->key->payload.data[0]; @@ -540,11 +521,9 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, sp->len, iv.x); + skcipher_request_set_crypt(req, sg, sg, len, iv.x); ret = crypto_skcipher_decrypt(req); skcipher_request_zero(req); - if (sg != _sg) - kfree(sg); if (ret < 0) { if (ret == -ENOMEM) return ret; @@ -553,13 +532,11 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, } /* Extract the decrypted packet length */ - if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) - return rxrpc_abort_eproto(call, skb, RXKADDATALEN, - rxkad_abort_2_short_len); - sp->offset += sizeof(sechdr); - sp->len -= sizeof(sechdr); + sechdr = data; + call->rx_dec_offset = sizeof(*sechdr); + len -= sizeof(*sechdr); - buf = ntohl(sechdr.data_size); + buf = ntohl(sechdr->data_size); data_size = buf & 0xffff; check = buf >> 16; @@ -569,17 +546,18 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, rxkad_abort_2_short_check); - if (data_size > sp->len) + if (data_size > len) return rxrpc_abort_eproto(call, skb, RXKADDATALEN, rxkad_abort_2_short_data); - sp->len = data_size; + call->rx_dec_len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; } /* - * Verify the security on a received packet and the subpackets therein. + * Verify the security on a received (sub)packet. If the packet needs + * modifying (e.g. decrypting), it must be copied. */ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) { @@ -985,7 +963,6 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, *_expiry = 0; ASSERT(server_key->payload.data[0] != NULL); - ASSERTCMP((unsigned long) ticket & 7UL, ==, 0); memcpy(&iv, &server_key->payload.data[2], sizeof(iv)); @@ -1134,14 +1111,15 @@ unlock: * verify a response */ static int rxkad_verify_response(struct rxrpc_connection *conn, - struct sk_buff *skb) + struct sk_buff *skb, + void *buffer, unsigned int len) { struct rxkad_response *response; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt session_key; struct key *server_key; time64_t expiry; - void *ticket = NULL; + void *ticket; u32 version, kvno, ticket_len, level; __be32 csum; int ret, i; @@ -1164,13 +1142,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, } } - ret = -ENOMEM; - response = kzalloc_obj(struct rxkad_response, GFP_NOFS); - if (!response) - goto error; - - if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), - response, sizeof(*response)) < 0) { + response = buffer; + if (len < sizeof(*response)) { ret = rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, rxkad_abort_resp_short); goto error; @@ -1182,6 +1155,9 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, trace_rxrpc_rx_response(conn, sp->hdr.serial, version, kvno, ticket_len); + buffer += sizeof(*response); + len -= sizeof(*response); + if (version != RXKAD_VERSION) { ret = rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, rxkad_abort_resp_version); @@ -1201,13 +1177,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, } /* extract the kerberos ticket and decrypt and decode it */ - ret = -ENOMEM; - ticket = kmalloc(ticket_len, GFP_NOFS); - if (!ticket) - goto error; - - if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response), - ticket, ticket_len) < 0) { + ticket = buffer; + if (ticket_len > len) { ret = rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, rxkad_abort_resp_short_tkt); goto error; @@ -1287,8 +1258,6 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, ret = rxrpc_get_server_data_key(conn, &session_key, expiry, kvno); error: - kfree(ticket); - kfree(response); key_put(server_key); _leave(" = %d", ret); return ret; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 332fd9695e54..04ea11c90e03 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -112,11 +112,6 @@ struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action, } EXPORT_SYMBOL(tcf_action_set_ctrlact); -/* XXX: For standalone actions, we don't need a RCU grace period either, because - * actions are always connected to filters and filters are already destroyed in - * RCU callbacks, so after a RCU grace period actions are already disconnected - * from filters. Readers later can not find us. - */ static void free_tcf(struct tc_action *p) { struct tcf_chain *chain = rcu_dereference_protected(p->goto_chain, 1); @@ -129,7 +124,7 @@ static void free_tcf(struct tc_action *p) if (chain) tcf_chain_put_by_act(chain); - kfree(p); + kfree_rcu(p, tcfa_rcu); } static void offload_action_hw_count_set(struct tc_action *act, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 2c5a7a321a94..553342c55cf7 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -26,6 +26,10 @@ #include <net/tc_act/tc_mirred.h> #include <net/tc_wrapper.h> +#define MIRRED_DEFER_LIMIT 3 +_Static_assert(MIRRED_DEFER_LIMIT <= 3, + "MIRRED_DEFER_LIMIT exceeds tc_depth bitfield width"); + static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); @@ -234,12 +238,15 @@ tcf_mirred_forward(bool at_ingress, bool want_ingress, struct sk_buff *skb) { int err; - if (!want_ingress) + if (!want_ingress) { err = tcf_dev_queue_xmit(skb, dev_queue_xmit); - else if (!at_ingress) - err = netif_rx(skb); - else - err = netif_receive_skb(skb); + } else { + skb->tc_depth++; + if (!at_ingress) + err = netif_rx(skb); + else + err = netif_receive_skb(skb); + } return err; } @@ -365,7 +372,8 @@ assign_prev: dev_is_mac_header_xmit(dev_prev), m_eaction, retval); - return retval; + /* If the packet wasn't redirected, we have to register as a drop */ + return TC_ACT_SHOT; } static int tcf_blockcast_mirror(struct sk_buff *skb, struct tcf_mirred *m, @@ -389,14 +397,12 @@ static int tcf_blockcast_mirror(struct sk_buff *skb, struct tcf_mirred *m, static int tcf_blockcast(struct sk_buff *skb, struct tcf_mirred *m, const u32 blockid, struct tcf_result *res, - int retval) + int m_eaction, int retval) { const u32 exception_ifindex = skb->dev->ifindex; struct tcf_block *block; bool is_redirect; - int m_eaction; - m_eaction = READ_ONCE(m->tcfm_eaction); is_redirect = tcf_mirred_is_act_redirect(m_eaction); /* we are already under rcu protection, so can call block lookup @@ -405,7 +411,7 @@ static int tcf_blockcast(struct sk_buff *skb, struct tcf_mirred *m, block = tcf_block_lookup(dev_net(skb->dev), blockid); if (!block || xa_empty(&block->ports)) { tcf_action_inc_overlimit_qstats(&m->common); - return retval; + return is_redirect ? TC_ACT_SHOT : retval; } if (is_redirect) @@ -423,9 +429,10 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, { struct tcf_mirred *m = to_mirred(a); int retval = READ_ONCE(m->tcf_action); + bool m_mac_header_xmit, is_redirect; struct netdev_xmit *xmit; - bool m_mac_header_xmit; struct net_device *dev; + bool want_ingress; int i, m_eaction; u32 blockid; @@ -434,7 +441,8 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, #else xmit = this_cpu_ptr(&softnet_data.xmit); #endif - if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT)) { + if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT || + skb->tc_depth >= MIRRED_DEFER_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); return TC_ACT_SHOT; @@ -444,34 +452,51 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, tcf_action_update_bstats(&m->common, skb); blockid = READ_ONCE(m->tcfm_blockid); - if (blockid) - return tcf_blockcast(skb, m, blockid, res, retval); + m_eaction = READ_ONCE(m->tcfm_eaction); + want_ingress = tcf_mirred_act_wants_ingress(m_eaction); + if (blockid) { + if (!want_ingress) + xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = NULL; + retval = tcf_blockcast(skb, m, blockid, res, m_eaction, retval); + if (!want_ingress) + xmit->sched_mirred_nest--; + return retval; + } + + is_redirect = tcf_mirred_is_act_redirect(m_eaction); dev = rcu_dereference_bh(m->tcfm_dev); if (unlikely(!dev)) { pr_notice_once("tc mirred: target device is gone\n"); tcf_action_inc_overlimit_qstats(&m->common); - return retval; - } - for (i = 0; i < xmit->sched_mirred_nest; i++) { - if (xmit->sched_mirred_dev[i] != dev) - continue; - pr_notice_once("tc mirred: loop on device %s\n", - netdev_name(dev)); - tcf_action_inc_overlimit_qstats(&m->common); - return retval; + goto err_out; } - xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev; + if (!want_ingress) { + for (i = 0; i < xmit->sched_mirred_nest; i++) { + if (xmit->sched_mirred_dev[i] != dev) + continue; + pr_notice_once("tc mirred: loop on device %s\n", + netdev_name(dev)); + tcf_action_inc_overlimit_qstats(&m->common); + goto err_out; + } + xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev; + } m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); - m_eaction = READ_ONCE(m->tcfm_eaction); retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction, retval); - xmit->sched_mirred_nest--; + if (!want_ingress) + xmit->sched_mirred_nest--; return retval; + +err_out: + if (is_redirect) + retval = TC_ACT_SHOT; + return retval; } static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets, diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index bc20f08a2789..bd3b1da3cd63 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -16,6 +16,8 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/slab.h> +#include <linux/overflow.h> +#include <linux/unaligned.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/pkt_sched.h> @@ -242,7 +244,6 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, goto out_free_ex; } - nparms->tcfp_off_max_hint = 0; nparms->tcfp_flags = parm->flags; nparms->tcfp_nkeys = parm->nkeys; @@ -268,14 +269,6 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, BITS_PER_TYPE(int) - 1, nparms->tcfp_keys[i].shift); - /* The AT option can read a single byte, we can bound the actual - * value with uchar max. - */ - cur += (0xff & offmask) >> nparms->tcfp_keys[i].shift; - - /* Each key touches 4 bytes starting from the computed offset */ - nparms->tcfp_off_max_hint = - max(nparms->tcfp_off_max_hint, cur + 4); } p = to_pedit(*a); @@ -318,15 +311,12 @@ static void tcf_pedit_cleanup(struct tc_action *a) call_rcu(&parms->rcu, tcf_pedit_cleanup_rcu); } -static bool offset_valid(struct sk_buff *skb, int offset) +static bool offset_valid(struct sk_buff *skb, int offset, int len) { - if (offset > 0 && offset > skb->len) - return false; - - if (offset < 0 && -offset > skb_headroom(skb)) + if (offset < -(int)skb_headroom(skb)) return false; - return true; + return offset <= (int)skb->len - len; } static int pedit_l4_skb_offset(struct sk_buff *skb, int *hoffset, const int header_type) @@ -393,18 +383,10 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb, struct tcf_pedit_key_ex *tkey_ex; struct tcf_pedit_parms *parms; struct tc_pedit_key *tkey; - u32 max_offset; int i; parms = rcu_dereference_bh(p->parms); - max_offset = (skb_transport_header_was_set(skb) ? - skb_transport_offset(skb) : - skb_network_offset(skb)) + - parms->tcfp_off_max_hint; - if (skb_ensure_writable(skb, min(skb->len, max_offset))) - goto done; - tcf_lastuse_update(&p->tcf_tm); tcf_action_update_bstats(&p->common, skb); @@ -412,10 +394,11 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb, tkey_ex = parms->tcfp_keys_ex; for (i = parms->tcfp_nkeys; i > 0; i--, tkey++) { + int write_offset, write_len; int offset = tkey->off; int hoffset = 0; - u32 *ptr, hdata; - u32 val; + u32 cur_val, val; + u32 *ptr; int rc; if (tkey_ex) { @@ -433,13 +416,15 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb, if (tkey->offmask) { u8 *d, _d; + int at_offset; - if (!offset_valid(skb, hoffset + tkey->at)) { + if (check_add_overflow(hoffset, (int)tkey->at, &at_offset) || + !offset_valid(skb, at_offset, sizeof(_d))) { pr_info_ratelimited("tc action pedit 'at' offset %d out of bounds\n", hoffset + tkey->at); goto bad; } - d = skb_header_pointer(skb, hoffset + tkey->at, + d = skb_header_pointer(skb, at_offset, sizeof(_d), &_d); if (!d) goto bad; @@ -451,31 +436,51 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb, } } - if (!offset_valid(skb, hoffset + offset)) { - pr_info_ratelimited("tc action pedit offset %d out of bounds\n", hoffset + offset); + if (check_add_overflow(hoffset, offset, &write_offset)) { + pr_info_ratelimited("tc action pedit offset overflow\n"); goto bad; } - ptr = skb_header_pointer(skb, hoffset + offset, - sizeof(hdata), &hdata); - if (!ptr) + if (!offset_valid(skb, write_offset, sizeof(*ptr))) { + pr_info_ratelimited("tc action pedit offset %d out of bounds\n", + write_offset); goto bad; + } + + if (write_offset < 0) { + if (skb_cow(skb, -write_offset)) + goto bad; + if (write_offset + (int)sizeof(*ptr) > 0) { + if (skb_ensure_writable(skb, + min_t(int, skb->len, + write_offset + (int)sizeof(*ptr)))) + goto bad; + } + } else { + if (check_add_overflow(write_offset, (int)sizeof(*ptr), + &write_len)) + goto bad; + if (skb_ensure_writable(skb, min_t(int, skb->len, + write_len))) + goto bad; + } + + ptr = (u32 *)(skb->data + write_offset); + cur_val = get_unaligned(ptr); /* just do it, baby */ switch (cmd) { case TCA_PEDIT_KEY_EX_CMD_SET: val = tkey->val; break; case TCA_PEDIT_KEY_EX_CMD_ADD: - val = (*ptr + tkey->val) & ~tkey->mask; + val = (cur_val + tkey->val) & ~tkey->mask; break; default: pr_info_ratelimited("tc action pedit bad command (%d)\n", cmd); goto bad; } - *ptr = ((*ptr & tkey->mask) ^ val); - if (ptr == &hdata) - skb_store_bits(skb, hoffset + offset, ptr, 4); + put_unaligned((cur_val & tkey->mask) ^ val, ptr); } goto done; diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 13c6d1869a14..5862933be8d7 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -399,14 +399,14 @@ static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust); * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 */ -static void cobalt_newton_step(struct cobalt_vars *vars) +static void cobalt_newton_step(struct cobalt_vars *vars, u32 count) { u32 invsqrt, invsqrt2; u64 val; invsqrt = vars->rec_inv_sqrt; invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; - val = (3LL << 32) - ((u64)vars->count * invsqrt2); + val = (3LL << 32) - ((u64)count * invsqrt2); val >>= 2; /* avoid overflow in following multiply */ val = (val * invsqrt) >> (32 - 2 + 1); @@ -414,12 +414,12 @@ static void cobalt_newton_step(struct cobalt_vars *vars) vars->rec_inv_sqrt = val; } -static void cobalt_invsqrt(struct cobalt_vars *vars) +static void cobalt_invsqrt(struct cobalt_vars *vars, u32 count) { - if (vars->count < REC_INV_SQRT_CACHE) - vars->rec_inv_sqrt = inv_sqrt_cache[vars->count]; + if (count < REC_INV_SQRT_CACHE) + vars->rec_inv_sqrt = inv_sqrt_cache[count]; else - cobalt_newton_step(vars); + cobalt_newton_step(vars, count); } static void cobalt_vars_init(struct cobalt_vars *vars) @@ -449,16 +449,19 @@ static bool cobalt_queue_full(struct cobalt_vars *vars, bool up = false; if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { - up = !vars->p_drop; - vars->p_drop += p->p_inc; - if (vars->p_drop < p->p_inc) - vars->p_drop = ~0; - vars->blue_timer = now; - } - vars->dropping = true; - vars->drop_next = now; + u32 p_drop = vars->p_drop; + + up = !p_drop; + p_drop += p->p_inc; + if (p_drop < p->p_inc) + p_drop = ~0; + WRITE_ONCE(vars->p_drop, p_drop); + WRITE_ONCE(vars->blue_timer, now); + } + WRITE_ONCE(vars->dropping, true); + WRITE_ONCE(vars->drop_next, now); if (!vars->count) - vars->count = 1; + WRITE_ONCE(vars->count, 1); return up; } @@ -475,20 +478,20 @@ static bool cobalt_queue_empty(struct cobalt_vars *vars, if (vars->p_drop && ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { if (vars->p_drop < p->p_dec) - vars->p_drop = 0; + WRITE_ONCE(vars->p_drop, 0); else - vars->p_drop -= p->p_dec; - vars->blue_timer = now; + WRITE_ONCE(vars->p_drop, vars->p_drop - p->p_dec); + WRITE_ONCE(vars->blue_timer, now); down = !vars->p_drop; } - vars->dropping = false; + WRITE_ONCE(vars->dropping, false); if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { - vars->count--; - cobalt_invsqrt(vars); - vars->drop_next = cobalt_control(vars->drop_next, - p->interval, - vars->rec_inv_sqrt); + WRITE_ONCE(vars->count, vars->count - 1); + cobalt_invsqrt(vars, vars->count); + WRITE_ONCE(vars->drop_next, + cobalt_control(vars->drop_next, p->interval, + vars->rec_inv_sqrt)); } return down; @@ -507,6 +510,7 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars, bool next_due, over_target; ktime_t schedule; u64 sojourn; + u32 count; /* The 'schedule' variable records, in its sign, whether 'now' is before or * after 'drop_next'. This allows 'drop_next' to be updated before the next @@ -528,21 +532,22 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars, over_target = sojourn > p->target && sojourn > p->mtu_time * bulk_flows * 2 && sojourn > p->mtu_time * 4; - next_due = vars->count && ktime_to_ns(schedule) >= 0; + count = vars->count; + next_due = count && ktime_to_ns(schedule) >= 0; vars->ecn_marked = false; if (over_target) { if (!vars->dropping) { - vars->dropping = true; - vars->drop_next = cobalt_control(now, - p->interval, - vars->rec_inv_sqrt); + WRITE_ONCE(vars->dropping, true); + WRITE_ONCE(vars->drop_next, + cobalt_control(now, p->interval, + vars->rec_inv_sqrt)); } - if (!vars->count) - vars->count = 1; + if (!count) + count = 1; } else if (vars->dropping) { - vars->dropping = false; + WRITE_ONCE(vars->dropping, false); } if (next_due && vars->dropping) { @@ -550,23 +555,23 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars, if (!(vars->ecn_marked = INET_ECN_set_ce(skb))) reason = QDISC_DROP_CONGESTED; - vars->count++; - if (!vars->count) - vars->count--; - cobalt_invsqrt(vars); - vars->drop_next = cobalt_control(vars->drop_next, - p->interval, - vars->rec_inv_sqrt); + count++; + if (!count) + count--; + cobalt_invsqrt(vars, count); + WRITE_ONCE(vars->drop_next, + cobalt_control(vars->drop_next, p->interval, + vars->rec_inv_sqrt)); schedule = ktime_sub(now, vars->drop_next); } else { while (next_due) { - vars->count--; - cobalt_invsqrt(vars); - vars->drop_next = cobalt_control(vars->drop_next, - p->interval, - vars->rec_inv_sqrt); + count--; + cobalt_invsqrt(vars, count); + WRITE_ONCE(vars->drop_next, + cobalt_control(vars->drop_next, p->interval, + vars->rec_inv_sqrt)); schedule = ktime_sub(now, vars->drop_next); - next_due = vars->count && ktime_to_ns(schedule) >= 0; + next_due = count && ktime_to_ns(schedule) >= 0; } } @@ -575,11 +580,12 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars, get_random_u32() < vars->p_drop) reason = QDISC_DROP_FLOOD_PROTECTION; + WRITE_ONCE(vars->count, count); /* Overload the drop_next field as an activity timeout */ - if (!vars->count) - vars->drop_next = ktime_add_ns(now, p->interval); + if (!count) + WRITE_ONCE(vars->drop_next, ktime_add_ns(now, p->interval)); else if (ktime_to_ns(schedule) > 0 && reason == QDISC_DROP_UNSPEC) - vars->drop_next = now; + WRITE_ONCE(vars->drop_next, now); return reason; } @@ -914,7 +920,7 @@ static struct sk_buff *dequeue_head(struct cake_flow *flow) struct sk_buff *skb = flow->head; if (skb) { - flow->head = skb->next; + WRITE_ONCE(flow->head, skb->next); skb_mark_not_on_list(skb); } @@ -926,7 +932,7 @@ static struct sk_buff *dequeue_head(struct cake_flow *flow) static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) { if (!flow->head) - flow->head = skb; + WRITE_ONCE(flow->head, skb); else flow->tail->next = skb; flow->tail = skb; @@ -1357,7 +1363,7 @@ found: if (elig_ack_prev) elig_ack_prev->next = elig_ack->next; else - flow->head = elig_ack->next; + WRITE_ONCE(flow->head, elig_ack->next); skb_mark_not_on_list(elig_ack); @@ -1595,11 +1601,11 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) len = qdisc_pkt_len(skb); q->buffer_used -= skb->truesize; - b->backlogs[idx] -= len; WRITE_ONCE(b->tin_backlog, b->tin_backlog - len); + WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] - len); sch->qstats.backlog -= len; - flow->dropped++; + WRITE_ONCE(flow->dropped, flow->dropped + 1); WRITE_ONCE(b->tin_dropped, b->tin_dropped + 1); if (q->config->rate_flags & CAKE_FLAG_INGRESS) @@ -1824,11 +1830,11 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, } /* stats */ - b->backlogs[idx] += slen; sch->qstats.backlog += slen; q->avg_window_bytes += slen; WRITE_ONCE(b->bytes, b->bytes + slen); WRITE_ONCE(b->tin_backlog, b->tin_backlog + slen); + WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] + slen); qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); consume_skb(skb); @@ -1861,11 +1867,11 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* stats */ WRITE_ONCE(b->packets, b->packets + 1); - b->backlogs[idx] += len - ack_pkt_len; sch->qstats.backlog += len - ack_pkt_len; q->avg_window_bytes += len - ack_pkt_len; WRITE_ONCE(b->bytes, b->bytes + len - ack_pkt_len); WRITE_ONCE(b->tin_backlog, b->tin_backlog + len - ack_pkt_len); + WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] + len - ack_pkt_len); } if (q->overflow_timeout) @@ -1924,7 +1930,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, flow->set = CAKE_SET_SPARSE; WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count + 1); - flow->deficit = cake_get_flow_quantum(b, flow, q->config->flow_mode); + WRITE_ONCE(flow->deficit, cake_get_flow_quantum(b, flow, q->config->flow_mode)); } else if (flow->set == CAKE_SET_SPARSE_WAIT) { /* this flow was empty, accounted as a sparse flow, but actually * in the bulk rotation. @@ -1977,7 +1983,7 @@ static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) if (flow->head) { skb = dequeue_head(flow); len = qdisc_pkt_len(skb); - b->backlogs[q->cur_flow] -= len; + WRITE_ONCE(b->backlogs[q->cur_flow], b->backlogs[q->cur_flow] - len); WRITE_ONCE(b->tin_backlog, b->tin_backlog - len); sch->qstats.backlog -= len; q->buffer_used -= skb->truesize; @@ -2166,7 +2172,8 @@ retry: } } - flow->deficit += cake_get_flow_quantum(b, flow, q->config->flow_mode); + WRITE_ONCE(flow->deficit, + flow->deficit + cake_get_flow_quantum(b, flow, q->config->flow_mode)); list_move_tail(&flow->flowchain, &b->old_flows); goto retry; @@ -2232,10 +2239,10 @@ retry: if (q->config->rate_flags & CAKE_FLAG_INGRESS) { len = cake_advance_shaper(q, b, skb, now, true); - flow->deficit -= len; + WRITE_ONCE(flow->deficit, flow->deficit - len); b->tin_deficit -= len; } - flow->dropped++; + WRITE_ONCE(flow->dropped, flow->dropped + 1); WRITE_ONCE(b->tin_dropped, b->tin_dropped + 1); qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); @@ -2259,7 +2266,7 @@ retry: delay < b->base_delay ? 2 : 8)); len = cake_advance_shaper(q, b, skb, now, false); - flow->deficit -= len; + WRITE_ONCE(flow->deficit, flow->deficit - len); b->tin_deficit -= len; if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { @@ -3137,7 +3144,7 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, flow = &b->flows[idx % CAKE_QUEUES]; - if (flow->head) { + if (READ_ONCE(flow->head)) { sch_tree_lock(sch); skb = flow->head; while (skb) { @@ -3146,13 +3153,15 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, } sch_tree_unlock(sch); } - qs.backlog = b->backlogs[idx % CAKE_QUEUES]; - qs.drops = flow->dropped; + qs.backlog = READ_ONCE(b->backlogs[idx % CAKE_QUEUES]); + qs.drops = READ_ONCE(flow->dropped); } if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; if (flow) { ktime_t now = ktime_get(); + bool dropping; + u32 p_drop; stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); if (!stats) @@ -3167,21 +3176,23 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, goto nla_put_failure; \ } while (0) - PUT_STAT_S32(DEFICIT, flow->deficit); - PUT_STAT_U32(DROPPING, flow->cvars.dropping); - PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); - PUT_STAT_U32(P_DROP, flow->cvars.p_drop); - if (flow->cvars.p_drop) { + PUT_STAT_S32(DEFICIT, READ_ONCE(flow->deficit)); + dropping = READ_ONCE(flow->cvars.dropping); + PUT_STAT_U32(DROPPING, dropping); + PUT_STAT_U32(COBALT_COUNT, READ_ONCE(flow->cvars.count)); + p_drop = READ_ONCE(flow->cvars.p_drop); + PUT_STAT_U32(P_DROP, p_drop); + if (p_drop) { PUT_STAT_S32(BLUE_TIMER_US, ktime_to_us( ktime_sub(now, - flow->cvars.blue_timer))); + READ_ONCE(flow->cvars.blue_timer)))); } - if (flow->cvars.dropping) { + if (dropping) { PUT_STAT_S32(DROP_NEXT_US, ktime_to_us( ktime_sub(now, - flow->cvars.drop_next))); + READ_ONCE(flow->cvars.drop_next)))); } if (nla_nest_end(d->skb, stats) < 0) diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 8c9a0400c862..0f953bd46b58 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -243,6 +243,20 @@ static struct sk_buff *cbs_dequeue(struct Qdisc *sch) return q->dequeue(sch); } +static void cbs_reset(struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + /* Nothing to do if we couldn't create the underlying qdisc */ + if (!q->qdisc) + return; + + qdisc_reset(q->qdisc); + qdisc_watchdog_cancel(&q->watchdog); + q->credits = 0; + q->last = 0; +} + static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = { [TCA_CBS_PARMS] = { .len = sizeof(struct tc_cbs_qopt) }, }; @@ -540,7 +554,7 @@ static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { .dequeue = cbs_dequeue, .peek = qdisc_peek_dequeued, .init = cbs_init, - .reset = qdisc_reset_queue, + .reset = cbs_reset, .destroy = cbs_destroy, .change = cbs_change, .dump = cbs_dump, diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index 241e6a46bd00..a22489c14458 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -938,6 +938,8 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, int err; sch->flags |= TCQ_F_DEQUEUE_DROPS; + hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_SOFT); q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, 1), extack); @@ -950,8 +952,6 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, q->sch = sch; dualpi2_reset_default(sch); - hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED_SOFT); if (opt && nla_len(opt)) { err = dualpi2_change(sch, opt, extack); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 0664b2f2d6f2..24db54684e8a 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -117,7 +117,7 @@ static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow) { struct sk_buff *skb = flow->head; - flow->head = skb->next; + WRITE_ONCE(flow->head, skb->next); skb_mark_not_on_list(skb); return skb; } @@ -127,7 +127,7 @@ static inline void flow_queue_add(struct fq_codel_flow *flow, struct sk_buff *skb) { if (flow->head == NULL) - flow->head = skb; + WRITE_ONCE(flow->head, skb); else flow->tail->next = skb; flow->tail = skb; @@ -173,8 +173,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets, } while (++i < max_packets && len < threshold); /* Tell codel to increase its signal strength also */ - flow->cvars.count += i; - q->backlogs[idx] -= len; + WRITE_ONCE(flow->cvars.count, flow->cvars.count + i); + WRITE_ONCE(q->backlogs[idx], q->backlogs[idx] - len); q->memory_usage -= mem; sch->qstats.drops += i; sch->qstats.backlog -= len; @@ -204,13 +204,13 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch, codel_set_enqueue_time(skb); flow = &q->flows[idx]; flow_queue_add(flow, skb); - q->backlogs[idx] += qdisc_pkt_len(skb); + WRITE_ONCE(q->backlogs[idx], q->backlogs[idx] + qdisc_pkt_len(skb)); qdisc_qstats_backlog_inc(sch, skb); if (list_empty(&flow->flowchain)) { list_add_tail(&flow->flowchain, &q->new_flows); q->new_flow_count++; - flow->deficit = q->quantum; + WRITE_ONCE(flow->deficit, q->quantum); } get_codel_cb(skb)->mem_usage = skb->truesize; q->memory_usage += get_codel_cb(skb)->mem_usage; @@ -263,7 +263,8 @@ static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx) flow = container_of(vars, struct fq_codel_flow, cvars); if (flow->head) { skb = dequeue_head(flow); - q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb); + WRITE_ONCE(q->backlogs[flow - q->flows], + q->backlogs[flow - q->flows] - qdisc_pkt_len(skb)); q->memory_usage -= get_codel_cb(skb)->mem_usage; sch->q.qlen--; sch->qstats.backlog -= qdisc_pkt_len(skb); @@ -296,7 +297,7 @@ begin: flow = list_first_entry(head, struct fq_codel_flow, flowchain); if (flow->deficit <= 0) { - flow->deficit += q->quantum; + WRITE_ONCE(flow->deficit, flow->deficit + q->quantum); list_move_tail(&flow->flowchain, &q->old_flows); goto begin; } @@ -314,7 +315,7 @@ begin: goto begin; } qdisc_bstats_update(sch, skb); - flow->deficit -= qdisc_pkt_len(skb); + WRITE_ONCE(flow->deficit, flow->deficit - qdisc_pkt_len(skb)); if (q->cstats.drop_count) { qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, @@ -328,7 +329,7 @@ begin: static void fq_codel_flow_purge(struct fq_codel_flow *flow) { rtnl_kfree_skbs(flow->head, flow->tail); - flow->head = NULL; + WRITE_ONCE(flow->head, NULL); } static void fq_codel_reset(struct Qdisc *sch) @@ -656,21 +657,21 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, memset(&xstats, 0, sizeof(xstats)); xstats.type = TCA_FQ_CODEL_XSTATS_CLASS; - xstats.class_stats.deficit = flow->deficit; + xstats.class_stats.deficit = READ_ONCE(flow->deficit); xstats.class_stats.ldelay = - codel_time_to_us(flow->cvars.ldelay); - xstats.class_stats.count = flow->cvars.count; - xstats.class_stats.lastcount = flow->cvars.lastcount; - xstats.class_stats.dropping = flow->cvars.dropping; - if (flow->cvars.dropping) { - codel_tdiff_t delta = flow->cvars.drop_next - + codel_time_to_us(READ_ONCE(flow->cvars.ldelay)); + xstats.class_stats.count = READ_ONCE(flow->cvars.count); + xstats.class_stats.lastcount = READ_ONCE(flow->cvars.lastcount); + xstats.class_stats.dropping = READ_ONCE(flow->cvars.dropping); + if (xstats.class_stats.dropping) { + codel_tdiff_t delta = READ_ONCE(flow->cvars.drop_next) - codel_get_time(); xstats.class_stats.drop_next = (delta >= 0) ? codel_time_to_us(delta) : -codel_time_to_us(-delta); } - if (flow->head) { + if (READ_ONCE(flow->head)) { sch_tree_lock(sch); skb = flow->head; while (skb) { @@ -679,7 +680,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, } sch_tree_unlock(sch); } - qs.backlog = q->backlogs[idx]; + qs.backlog = READ_ONCE(q->backlogs[idx]); qs.drops = 0; } if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index bc18e1976b6e..17a79fe2f091 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -461,7 +461,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, skb->prev = NULL; /* Random duplication */ - if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng)) + if (q->duplicate && skb->tc_depth == 0 && + q->duplicate >= get_crandom(&q->dup_cor, &q->prng)) ++count; /* Drop packet? */ @@ -540,11 +541,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, */ if (skb2) { struct Qdisc *rootq = qdisc_root_bh(sch); - u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ - q->duplicate = 0; + skb2->tc_depth++; /* prevent duplicating a dup... */ rootq->enqueue(skb2, rootq, to_free); - q->duplicate = dupsave; skb2 = NULL; } @@ -1007,41 +1006,6 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, return 0; } -static const struct Qdisc_class_ops netem_class_ops; - -static int check_netem_in_tree(struct Qdisc *sch, bool duplicates, - struct netlink_ext_ack *extack) -{ - struct Qdisc *root, *q; - unsigned int i; - - root = qdisc_root_sleeping(sch); - - if (sch != root && root->ops->cl_ops == &netem_class_ops) { - if (duplicates || - ((struct netem_sched_data *)qdisc_priv(root))->duplicate) - goto err; - } - - if (!qdisc_dev(root)) - return 0; - - hash_for_each(qdisc_dev(root)->qdisc_hash, i, q, hash) { - if (sch != q && q->ops->cl_ops == &netem_class_ops) { - if (duplicates || - ((struct netem_sched_data *)qdisc_priv(q))->duplicate) - goto err; - } - } - - return 0; - -err: - NL_SET_ERR_MSG(extack, - "netem: cannot mix duplicating netems with other netems in tree"); - return -EINVAL; -} - /* Parse netlink message to set options */ static int netem_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) @@ -1118,11 +1082,6 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, q->gap = qopt->gap; q->counter = 0; q->loss = qopt->loss; - - ret = check_netem_in_tree(sch, qopt->duplicate, extack); - if (ret) - goto unlock; - q->duplicate = qopt->duplicate; /* for compatibility with earlier versions. diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index fb53fbf0e328..b41f2def2e2c 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -219,16 +219,14 @@ void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, * packet timestamp. */ if (!params->dq_rate_estimator) { - vars->qdelay = now - pie_get_enqueue_time(skb); + WRITE_ONCE(vars->qdelay, + backlog ? now - pie_get_enqueue_time(skb) : 0); if (vars->dq_tstamp != DTIME_INVALID) dtime = now - vars->dq_tstamp; vars->dq_tstamp = now; - if (backlog == 0) - vars->qdelay = 0; - if (dtime == 0) return; @@ -376,7 +374,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) delta += MAX_PROB / (100 / 2); - vars->prob += delta; + WRITE_ONCE(vars->prob, vars->prob + delta); if (delta > 0) { /* prevent overflow */ @@ -401,7 +399,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, if (qdelay == 0 && qdelay_old == 0 && update_prob) /* Reduce drop probability to 98.4% */ - vars->prob -= vars->prob / 64; + WRITE_ONCE(vars->prob, vars->prob - vars->prob / 64); WRITE_ONCE(vars->qdelay, qdelay); vars->backlog_old = backlog; @@ -501,7 +499,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct pie_sched_data *q = qdisc_priv(sch); struct tc_pie_xstats st = { - .prob = q->vars.prob << BITS_PER_BYTE, + .prob = READ_ONCE(q->vars.prob) << BITS_PER_BYTE, .delay = ((u32)PSCHED_TICKS2NS(READ_ONCE(q->vars.qdelay))) / NSEC_PER_USEC, .packets_in = READ_ONCE(q->stats.packets_in), @@ -512,7 +510,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) }; /* avg_dq_rate is only valid if dq_rate_estimator is enabled */ - st.dq_rate_estimating = q->params.dq_rate_estimator; + st.dq_rate_estimating = READ_ONCE(q->params.dq_rate_estimator); /* unscale and return dq_rate in bytes per sec */ if (st.dq_rate_estimating) diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 432b8a3000a5..4d0e44a2e7c6 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -162,7 +162,7 @@ static struct sk_buff *red_dequeue(struct Qdisc *sch) struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; - skb = child->dequeue(child); + skb = qdisc_dequeue_peeked(child); if (skb) { qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index bd5ef561030f..d3ee8e5479b3 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -441,7 +441,7 @@ static struct sk_buff *sfb_dequeue(struct Qdisc *sch) struct Qdisc *child = q->qdisc; struct sk_buff *skb; - skb = child->dequeue(q->qdisc); + skb = qdisc_dequeue_peeked(child); if (skb) { qdisc_bstats_update(sch, skb); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index c3f3181dba54..f39822babf88 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -225,7 +225,8 @@ static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) sfq_unlink(q, x, n, p); - d = q->slots[x].qlen--; + d = q->slots[x].qlen; + WRITE_ONCE(q->slots[x].qlen, d - 1); if (n == p && q->cur_depth == d) q->cur_depth--; sfq_link(q, x); @@ -238,7 +239,8 @@ static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x) sfq_unlink(q, x, n, p); - d = ++q->slots[x].qlen; + d = q->slots[x].qlen + 1; + WRITE_ONCE(q->slots[x].qlen, d); if (q->cur_depth < d) q->cur_depth = d; sfq_link(q, x); @@ -298,7 +300,7 @@ static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free) drop: skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot); len = qdisc_pkt_len(skb); - slot->backlog -= len; + WRITE_ONCE(slot->backlog, slot->backlog - len); sfq_dec(q, x); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); @@ -314,7 +316,7 @@ drop: q->tail = NULL; /* no more active slots */ else q->tail->next = slot->next; - q->ht[slot->hash] = SFQ_EMPTY_SLOT; + WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT); goto drop; } @@ -364,10 +366,10 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) x = q->dep[0].next; /* get a free slot */ if (x >= SFQ_MAX_FLOWS) return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_MAXFLOWS); - q->ht[hash] = x; + WRITE_ONCE(q->ht[hash], x); slot = &q->slots[x]; slot->hash = hash; - slot->backlog = 0; /* should already be 0 anyway... */ + WRITE_ONCE(slot->backlog, 0); /* should already be 0 anyway... */ red_set_vars(&slot->vars); goto enqueue; } @@ -426,7 +428,7 @@ congestion_drop: head = slot_dequeue_head(slot); delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); sch->qstats.backlog -= delta; - slot->backlog -= delta; + WRITE_ONCE(slot->backlog, slot->backlog - delta); qdisc_drop_reason(head, sch, to_free, QDISC_DROP_FLOW_LIMIT); slot_queue_add(slot, skb); @@ -436,7 +438,7 @@ congestion_drop: enqueue: qdisc_qstats_backlog_inc(sch, skb); - slot->backlog += qdisc_pkt_len(skb); + WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb)); slot_queue_add(slot, skb); sfq_inc(q, x); if (slot->qlen == 1) { /* The flow is new */ @@ -452,7 +454,7 @@ enqueue: */ q->tail = slot; /* We could use a bigger initial quantum for new flows */ - slot->allot = q->quantum; + WRITE_ONCE(slot->allot, q->quantum); } if (++sch->q.qlen <= q->limit) return NET_XMIT_SUCCESS; @@ -489,7 +491,7 @@ next_slot: slot = &q->slots[a]; if (slot->allot <= 0) { q->tail = slot; - slot->allot += q->quantum; + WRITE_ONCE(slot->allot, slot->allot + q->quantum); goto next_slot; } skb = slot_dequeue_head(slot); @@ -497,10 +499,10 @@ next_slot: qdisc_bstats_update(sch, skb); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); - slot->backlog -= qdisc_pkt_len(skb); + WRITE_ONCE(slot->backlog, slot->backlog - qdisc_pkt_len(skb)); /* Is the slot empty? */ if (slot->qlen == 0) { - q->ht[slot->hash] = SFQ_EMPTY_SLOT; + WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT); next_a = slot->next; if (a == next_a) { q->tail = NULL; /* no more active slots */ @@ -508,7 +510,7 @@ next_slot: } q->tail->next = next_a; } else { - slot->allot -= qdisc_pkt_len(skb); + WRITE_ONCE(slot->allot, slot->allot - qdisc_pkt_len(skb)); } return skb; } @@ -549,9 +551,9 @@ static void sfq_rehash(struct Qdisc *sch) sfq_dec(q, i); __skb_queue_tail(&list, skb); } - slot->backlog = 0; + WRITE_ONCE(slot->backlog, 0); red_set_vars(&slot->vars); - q->ht[slot->hash] = SFQ_EMPTY_SLOT; + WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT); } q->tail = NULL; @@ -570,7 +572,7 @@ drop: dropped++; continue; } - q->ht[hash] = x; + WRITE_ONCE(q->ht[hash], x); slot = &q->slots[x]; slot->hash = hash; } @@ -581,7 +583,7 @@ drop: slot->vars.qavg = red_calc_qavg(q->red_parms, &slot->vars, slot->backlog); - slot->backlog += qdisc_pkt_len(skb); + WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb)); sfq_inc(q, x); if (slot->qlen == 1) { /* The flow is new */ if (q->tail == NULL) { /* It is the first flow */ @@ -591,7 +593,7 @@ drop: q->tail->next = x; } q->tail = slot; - slot->allot = q->quantum; + WRITE_ONCE(slot->allot, q->quantum); } } sch->q.qlen -= dropped; @@ -905,16 +907,16 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct sfq_sched_data *q = qdisc_priv(sch); - sfq_index idx = q->ht[cl - 1]; + sfq_index idx = READ_ONCE(q->ht[cl - 1]); struct gnet_stats_queue qs = { 0 }; struct tc_sfq_xstats xstats = { 0 }; if (idx != SFQ_EMPTY_SLOT) { const struct sfq_slot *slot = &q->slots[idx]; - xstats.allot = slot->allot; - qs.qlen = slot->qlen; - qs.backlog = slot->backlog; + xstats.allot = READ_ONCE(slot->allot); + qs.qlen = READ_ONCE(slot->qlen); + qs.backlog = READ_ONCE(slot->backlog); } if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; @@ -930,7 +932,7 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) return; for (i = 0; i < q->divisor; i++) { - if (q->ht[i] == SFQ_EMPTY_SLOT) { + if (READ_ONCE(q->ht[i]) == SFQ_EMPTY_SLOT) { arg->count++; continue; } diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 2afb376299fe..d758f5c3e06e 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -266,15 +266,15 @@ static int sctp_sock_dump_one(struct sctp_endpoint *ep, struct sctp_transport *t lock_sock(sk); - rep = nlmsg_new(inet_assoc_attr_size(sk, assoc), GFP_KERNEL); - if (!rep) { - release_sock(sk); - return -ENOMEM; + if (ep != assoc->ep || assoc->base.dead) { + err = -ESTALE; + goto out_unlock; } - if (ep != assoc->ep) { - err = -EAGAIN; - goto out; + rep = nlmsg_new(inet_assoc_attr_size(sk, assoc), GFP_KERNEL); + if (!rep) { + err = -ENOMEM; + goto out_unlock; } err = inet_sctp_diag_fill(sk, assoc, rep, req, sk_user_ns(NETLINK_CB(skb).sk), @@ -289,8 +289,9 @@ static int sctp_sock_dump_one(struct sctp_endpoint *ep, struct sctp_transport *t return nlmsg_unicast(sock_net(skb->sk)->diag_nlsk, rep, NETLINK_CB(skb).portid); out: - release_sock(sk); kfree_skb(rep); +out_unlock: + release_sock(sk); return err; } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index de86ac088289..85264862fb6b 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1730,6 +1730,7 @@ struct sctp_association *sctp_unpack_cookie( struct sctp_signed_cookie *cookie; struct sk_buff *skb = chunk->skb; struct sctp_cookie *bear_cookie; + struct sctp_chunkhdr *ch; enum sctp_scope scope; unsigned int len; ktime_t kt; @@ -1759,6 +1760,10 @@ struct sctp_association *sctp_unpack_cookie( cookie = chunk->subh.cookie_hdr; bear_cookie = &cookie->c; + ch = (struct sctp_chunkhdr *)(bear_cookie + 1); + if (ntohs(ch->length) > len - fixed_size) + goto malformed; + /* Verify the cookie's MAC, if cookie authentication is enabled. */ if (sctp_sk(ep->base.sk)->cookie_auth_enable) { u8 mac[SHA256_DIGEST_SIZE]; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 8e89a870780c..9b23c11cbb9e 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2598,11 +2598,7 @@ static enum sctp_disposition sctp_sf_do_5_2_6_stale( */ sctp_add_cmd_sf(commands, SCTP_CMD_DEL_NON_PRIMARY, SCTP_NULL()); - /* If we've sent any data bundled with COOKIE-ECHO we will need to - * resend - */ - sctp_add_cmd_sf(commands, SCTP_CMD_T1_RETRAN, - SCTP_TRANSPORT(asoc->peer.primary_path)); + sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_OUTQUEUE, SCTP_NULL()); /* Cast away the const modifier, as we want to just * rerun it through as a sideffect. diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 58d0d9747f0b..66e12fb0c646 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1986,6 +1986,15 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) goto out_unlock; iov_iter_revert(&msg->msg_iter, err); + + /* sctp_sendmsg_to_asoc() may have released the socket + * lock (sctp_wait_for_sndbuf), during which other + * associations on ep->asocs could have been peeled + * off or freed. @asoc itself is revalidated by the + * base.dead and base.sk checks in sctp_wait_for_sndbuf, + * so re-derive the cached cursor from it. + */ + tmp = list_next_entry(asoc, asocs); } goto out_unlock; @@ -9394,6 +9403,8 @@ static int sctp_wait_for_connect(struct sctp_association *asoc, long *timeo_p) release_sock(sk); current_timeo = schedule_timeout(current_timeo); lock_sock(sk); + if (sk != asoc->base.sk) + goto do_error; *timeo_p = current_timeo; } diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 94bc9c7382ea..dea9270f3e57 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -21,6 +21,8 @@ #define NET_SHAPER_ID_UNSPEC NET_SHAPER_ID_MASK +static_assert(NET_SHAPER_ID_UNSPEC == NET_SHAPER_MAX_HANDLE_ID + 1); + struct net_shaper_hierarchy { struct xarray shapers; }; @@ -90,6 +92,12 @@ static int net_shaper_handle_size(void) nla_total_size(sizeof(u32))); } +static int net_shaper_group_reply_size(void) +{ + return nla_total_size(sizeof(u32)) + /* NET_SHAPER_A_IFINDEX */ + net_shaper_handle_size(); /* NET_SHAPER_A_HANDLE */ +} + static int net_shaper_fill_binding(struct sk_buff *msg, const struct net_shaper_binding *binding, u32 type) @@ -130,35 +138,58 @@ handle_nest_cancel: return -EMSGSIZE; } +static void net_shaper_copy(struct net_shaper *dst, + const struct net_shaper *src) +{ + WRITE_ONCE(dst->parent.scope, READ_ONCE(src->parent.scope)); + WRITE_ONCE(dst->parent.id, READ_ONCE(src->parent.id)); + WRITE_ONCE(dst->handle.scope, READ_ONCE(src->handle.scope)); + WRITE_ONCE(dst->handle.id, READ_ONCE(src->handle.id)); + + WRITE_ONCE(dst->metric, READ_ONCE(src->metric)); + WRITE_ONCE(dst->bw_min, READ_ONCE(src->bw_min)); + WRITE_ONCE(dst->bw_max, READ_ONCE(src->bw_max)); + WRITE_ONCE(dst->burst, READ_ONCE(src->burst)); + WRITE_ONCE(dst->priority, READ_ONCE(src->priority)); + WRITE_ONCE(dst->weight, READ_ONCE(src->weight)); + + /* private fields are only used on the write path under the lock */ + data_race(dst->leaves = src->leaves); +} + static int net_shaper_fill_one(struct sk_buff *msg, const struct net_shaper_binding *binding, const struct net_shaper *shaper, const struct genl_info *info) { + struct net_shaper cur; void *hdr; hdr = genlmsg_iput(msg, info); if (!hdr) return -EMSGSIZE; + /* Make a copy to avoid data races */ + net_shaper_copy(&cur, shaper); + if (net_shaper_fill_binding(msg, binding, NET_SHAPER_A_IFINDEX) || - net_shaper_fill_handle(msg, &shaper->parent, + net_shaper_fill_handle(msg, &cur.parent, NET_SHAPER_A_PARENT) || - net_shaper_fill_handle(msg, &shaper->handle, + net_shaper_fill_handle(msg, &cur.handle, NET_SHAPER_A_HANDLE) || - ((shaper->bw_min || shaper->bw_max || shaper->burst) && - nla_put_u32(msg, NET_SHAPER_A_METRIC, shaper->metric)) || - (shaper->bw_min && - nla_put_uint(msg, NET_SHAPER_A_BW_MIN, shaper->bw_min)) || - (shaper->bw_max && - nla_put_uint(msg, NET_SHAPER_A_BW_MAX, shaper->bw_max)) || - (shaper->burst && - nla_put_uint(msg, NET_SHAPER_A_BURST, shaper->burst)) || - (shaper->priority && - nla_put_u32(msg, NET_SHAPER_A_PRIORITY, shaper->priority)) || - (shaper->weight && - nla_put_u32(msg, NET_SHAPER_A_WEIGHT, shaper->weight))) + ((cur.bw_min || cur.bw_max || cur.burst) && + nla_put_u32(msg, NET_SHAPER_A_METRIC, cur.metric)) || + (cur.bw_min && + nla_put_uint(msg, NET_SHAPER_A_BW_MIN, cur.bw_min)) || + (cur.bw_max && + nla_put_uint(msg, NET_SHAPER_A_BW_MAX, cur.bw_max)) || + (cur.burst && + nla_put_uint(msg, NET_SHAPER_A_BURST, cur.burst)) || + (cur.priority && + nla_put_u32(msg, NET_SHAPER_A_PRIORITY, cur.priority)) || + (cur.weight && + nla_put_u32(msg, NET_SHAPER_A_WEIGHT, cur.weight))) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -275,25 +306,24 @@ static void net_shaper_default_parent(const struct net_shaper_handle *handle, parent->id = 0; } -/* - * MARK_0 is already in use due to XA_FLAGS_ALLOC, can't reuse such flag as - * it's cleared by xa_store(). - */ -#define NET_SHAPER_NOT_VALID XA_MARK_1 - static struct net_shaper * net_shaper_lookup(struct net_shaper_binding *binding, const struct net_shaper_handle *handle) { u32 index = net_shaper_handle_to_index(handle); struct net_shaper_hierarchy *hierarchy; + struct net_shaper *cur; hierarchy = net_shaper_hierarchy_rcu(binding); - if (!hierarchy || xa_get_mark(&hierarchy->shapers, index, - NET_SHAPER_NOT_VALID)) + if (!hierarchy) + return NULL; + + cur = xa_load(&hierarchy->shapers, index); + /* Check valid before reading fields */ + if (!cur || !smp_load_acquire(&cur->valid)) return NULL; - return xa_load(&hierarchy->shapers, index); + return cur; } /* Allocate on demand the per device shaper's hierarchy container. @@ -348,7 +378,7 @@ static int net_shaper_pre_insert(struct net_shaper_binding *binding, handle->id == NET_SHAPER_ID_UNSPEC) { u32 min, max; - handle->id = NET_SHAPER_ID_MASK - 1; + handle->id = NET_SHAPER_MAX_HANDLE_ID; max = net_shaper_handle_to_index(handle); handle->id = 0; min = net_shaper_handle_to_index(handle); @@ -370,13 +400,10 @@ static int net_shaper_pre_insert(struct net_shaper_binding *binding, goto free_id; } - /* Mark 'tentative' shaper inside the hierarchy container. - * xa_set_mark is a no-op if the previous store fails. + /* Insert as 'tentative' (no VALID mark). The mark will be set by + * net_shaper_commit() once the driver-side configuration succeeds. */ - xa_lock(&hierarchy->shapers); - prev = __xa_store(&hierarchy->shapers, index, cur, GFP_KERNEL); - __xa_set_mark(&hierarchy->shapers, index, NET_SHAPER_NOT_VALID); - xa_unlock(&hierarchy->shapers); + prev = xa_store(&hierarchy->shapers, index, cur, GFP_KERNEL); if (xa_err(prev)) { NL_SET_ERR_MSG(extack, "Can't insert shaper into device store"); kfree_rcu(cur, rcu); @@ -410,12 +437,10 @@ static void net_shaper_commit(struct net_shaper_binding *binding, if (WARN_ON_ONCE(!cur)) continue; - /* Successful update: drop the tentative mark - * and update the hierarchy container. - */ - __xa_clear_mark(&hierarchy->shapers, index, - NET_SHAPER_NOT_VALID); - *cur = shapers[i]; + /* Successful update: update the hierarchy container... */ + net_shaper_copy(cur, &shapers[i]); + /* ... publish to lockless readers. */ + smp_store_release(&cur->valid, true); } xa_unlock(&hierarchy->shapers); } @@ -431,10 +456,11 @@ static void net_shaper_rollback(struct net_shaper_binding *binding) return; xa_lock(&hierarchy->shapers); - xa_for_each_marked(&hierarchy->shapers, index, cur, - NET_SHAPER_NOT_VALID) { + xa_for_each(&hierarchy->shapers, index, cur) { + if (cur->valid) + continue; __xa_erase(&hierarchy->shapers, index); - kfree(cur); + kfree_rcu(cur, rcu); } xa_unlock(&hierarchy->shapers); } @@ -465,10 +491,21 @@ static int net_shaper_parse_handle(const struct nlattr *attr, * shaper (any other value). */ id_attr = tb[NET_SHAPER_A_HANDLE_ID]; - if (id_attr) + if (id_attr) { id = nla_get_u32(id_attr); - else if (handle->scope == NET_SHAPER_SCOPE_NODE) + } else if (handle->scope == NET_SHAPER_SCOPE_NODE) { id = NET_SHAPER_ID_UNSPEC; + } else if (handle->scope == NET_SHAPER_SCOPE_QUEUE) { + NL_SET_ERR_ATTR_MISS(info->extack, attr, + NET_SHAPER_A_HANDLE_ID); + return -EINVAL; + } + + if (id && handle->scope == NET_SHAPER_SCOPE_NETDEV) { + NL_SET_ERR_MSG_ATTR(info->extack, id_attr, + "Netdev scope is a singleton, must use ID 0"); + return -EINVAL; + } handle->id = id; return 0; @@ -836,7 +873,12 @@ int net_shaper_nl_get_dumpit(struct sk_buff *skb, goto out_unlock; for (; (shaper = xa_find(&hierarchy->shapers, &ctx->start_index, - U32_MAX, XA_PRESENT)); ctx->start_index++) { + U32_MAX, XA_PRESENT)); + ctx->start_index++) { + /* Check valid before reading fields */ + if (!smp_load_acquire(&shaper->valid)) + continue; + ret = net_shaper_fill_one(skb, binding, shaper, info); if (ret) break; @@ -932,6 +974,46 @@ static int net_shaper_handle_cmp(const struct net_shaper_handle *a, return memcmp(a, b, sizeof(*a)); } +static int net_shaper_parse_leaves(struct net_shaper_binding *binding, + struct genl_info *info, + const struct net_shaper *node, + struct net_shaper *leaves, + int leaves_count) +{ + struct nlattr *attr; + int i, j, ret, rem; + + i = 0; + nla_for_each_attr_type(attr, NET_SHAPER_A_LEAVES, + genlmsg_data(info->genlhdr), + genlmsg_len(info->genlhdr), rem) { + if (WARN_ON_ONCE(i >= leaves_count)) + return -EINVAL; + + ret = net_shaper_parse_leaf(binding, attr, info, + node, &leaves[i]); + if (ret) + return ret; + + /* Reject duplicates */ + for (j = 0; j < i; j++) { + if (net_shaper_handle_cmp(&leaves[i].handle, + &leaves[j].handle)) + continue; + + NL_SET_ERR_MSG_ATTR_FMT(info->extack, attr, + "Duplicate leaf shaper %d:%d", + leaves[i].handle.scope, + leaves[i].handle.id); + return -EINVAL; + } + + i++; + } + + return 0; +} + static int net_shaper_parent_from_leaves(int leaves_count, const struct net_shaper *leaves, struct net_shaper *node, @@ -964,15 +1046,22 @@ static int __net_shaper_group(struct net_shaper_binding *binding, int i, ret; if (node->handle.scope == NET_SHAPER_SCOPE_NODE) { + struct net_shaper *cur = NULL; + new_node = node->handle.id == NET_SHAPER_ID_UNSPEC; - if (!new_node && !net_shaper_lookup(binding, &node->handle)) { - /* The related attribute is not available when - * reaching here from the delete() op. - */ - NL_SET_ERR_MSG_FMT(extack, "Node shaper %d:%d does not exists", - node->handle.scope, node->handle.id); - return -ENOENT; + if (!new_node) { + cur = net_shaper_lookup(binding, &node->handle); + if (!cur) { + /* The related attribute is not available + * when reaching here from the delete() op. + */ + NL_SET_ERR_MSG_FMT(extack, + "Node shaper %d:%d does not exist", + node->handle.scope, + node->handle.id); + return -ENOENT; + } } /* When unspecified, the node parent scope is inherited from @@ -986,6 +1075,15 @@ static int __net_shaper_group(struct net_shaper_binding *binding, return ret; } + if (cur && net_shaper_handle_cmp(&cur->parent, + &node->parent)) { + NL_SET_ERR_MSG_FMT(extack, + "Cannot reparent node shaper %d:%d", + node->handle.scope, + node->handle.id); + return -EOPNOTSUPP; + } + } else { net_shaper_default_parent(&node->handle, &node->parent); } @@ -1162,7 +1260,7 @@ static int net_shaper_group_send_reply(struct net_shaper_binding *binding, free_msg: /* Should never happen as msg is pre-allocated with enough space. */ WARN_ONCE(true, "calculated message payload length (%d)", - net_shaper_handle_size()); + net_shaper_group_reply_size()); nlmsg_free(msg); return -EMSGSIZE; } @@ -1172,10 +1270,9 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) struct net_shaper **old_nodes, *leaves, node = {}; struct net_shaper_hierarchy *hierarchy; struct net_shaper_binding *binding; - int i, ret, rem, leaves_count; + int i, ret, leaves_count; int old_nodes_count = 0; struct sk_buff *msg; - struct nlattr *attr; if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_LEAVES)) return -EINVAL; @@ -1203,26 +1300,19 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) if (ret) goto free_leaves; - i = 0; - nla_for_each_attr_type(attr, NET_SHAPER_A_LEAVES, - genlmsg_data(info->genlhdr), - genlmsg_len(info->genlhdr), rem) { - if (WARN_ON_ONCE(i >= leaves_count)) - goto free_leaves; - - ret = net_shaper_parse_leaf(binding, attr, info, - &node, &leaves[i]); - if (ret) - goto free_leaves; - i++; - } + ret = net_shaper_parse_leaves(binding, info, &node, + leaves, leaves_count); + if (ret) + goto free_leaves; /* Prepare the msg reply in advance, to avoid device operation * rollback on allocation failure. */ - msg = genlmsg_new(net_shaper_handle_size(), GFP_KERNEL); - if (!msg) + msg = genlmsg_new(net_shaper_group_reply_size(), GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; goto free_leaves; + } hierarchy = net_shaper_hierarchy_setup(binding); if (!hierarchy) { diff --git a/net/shaper/shaper_nl_gen.c b/net/shaper/shaper_nl_gen.c index 9b29be3ef19a..76eff85ec66d 100644 --- a/net/shaper/shaper_nl_gen.c +++ b/net/shaper/shaper_nl_gen.c @@ -11,10 +11,15 @@ #include <uapi/linux/net_shaper.h> +/* Integer value ranges */ +static const struct netlink_range_validation net_shaper_a_handle_id_range = { + .max = NET_SHAPER_MAX_HANDLE_ID, +}; + /* Common nested types */ const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1] = { [NET_SHAPER_A_HANDLE_SCOPE] = NLA_POLICY_MAX(NLA_U32, 3), - [NET_SHAPER_A_HANDLE_ID] = { .type = NLA_U32, }, + [NET_SHAPER_A_HANDLE_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &net_shaper_a_handle_id_range), }; const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1] = { diff --git a/net/shaper/shaper_nl_gen.h b/net/shaper/shaper_nl_gen.h index 42c46c52c775..2406652a9014 100644 --- a/net/shaper/shaper_nl_gen.h +++ b/net/shaper/shaper_nl_gen.h @@ -12,6 +12,8 @@ #include <uapi/linux/net_shaper.h> +#define NET_SHAPER_MAX_HANDLE_ID 67108862 + /* Common nested types */ extern const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1]; extern const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1]; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1a565095376a..b5db69073e20 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -188,10 +188,12 @@ static bool smc_hs_congested(const struct sock *sk) struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), + .ht = HLIST_HEAD_INIT, }; struct smc_hashinfo smc_v6_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock), + .ht = HLIST_HEAD_INIT, }; int smc_hash_sk(struct sock *sk) @@ -1400,7 +1402,8 @@ smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm *aclc, int i; for (i = 0; i < ini->ism_offered_cnt + 1; i++) { - if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) { + if (ini->ism_dev[i] && + ini->ism_chid[i] == ntohs(aclc->d1.chid)) { ini->ism_selected = i; return 0; } @@ -1628,12 +1631,8 @@ static void smc_connect_work(struct work_struct *work) lock_sock(&smc->sk); if (rc != 0 || smc->sk.sk_err) { smc->sk.sk_state = SMC_CLOSED; - if (rc == -EPIPE || rc == -EAGAIN) - smc->sk.sk_err = EPIPE; - else if (rc == -ECONNREFUSED) - smc->sk.sk_err = ECONNREFUSED; - else if (signal_pending(current)) - smc->sk.sk_err = -sock_intr_errno(timeo); + if (!smc->sk.sk_err) + smc->sk.sk_err = (rc == -EAGAIN) ? EPIPE : -rc; sock_put(&smc->sk); /* passive closing */ goto out; } @@ -3058,18 +3057,17 @@ static int __smc_setsockopt(struct socket *sock, int level, int optname, smc = smc_sk(sk); + /* pre-fetch user data outside the lock */ + if (optname == SMC_LIMIT_HS) { + if (optlen < sizeof(int)) + return -EINVAL; + if (copy_from_sockptr(&val, optval, sizeof(int))) + return -EFAULT; + } + lock_sock(sk); switch (optname) { case SMC_LIMIT_HS: - if (optlen < sizeof(int)) { - rc = -EINVAL; - break; - } - if (copy_from_sockptr(&val, optval, sizeof(int))) { - rc = -EFAULT; - break; - } - smc->limit_smc_hs = !!val; rc = 0; break; @@ -3521,8 +3519,6 @@ static int __init smc_init(void) pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto6; } - INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); - INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); rc = smc_ib_register_client(); if (rc) { diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h index a9a6e3c1113a..53da84f57fd6 100644 --- a/net/smc/smc_tracepoint.h +++ b/net/smc/smc_tracepoint.h @@ -51,7 +51,7 @@ DECLARE_EVENT_CLASS(smc_msg_event, __field(const void *, smc) __field(u64, net_cookie) __field(size_t, len) - __string(name, smc->conn.lnk->ibname) + __string(name, smc->conn.lnk ? smc->conn.lnk->ibname : "") ), TP_fast_assign( diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 7081c1214e6c..27dd6b58b8ff 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -403,7 +403,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd) INIT_LIST_HEAD(&cd->readers); spin_lock_init(&cd->queue_lock); init_waitqueue_head(&cd->queue_wait); - cd->next_seqno = 0; + cd->next_seqno = 1; spin_lock(&cache_list_lock); cd->nextcheck = 0; cd->entries = 0; @@ -1348,6 +1348,9 @@ static void *__cache_seq_start(struct seq_file *m, loff_t *pos) hash = n >> 32; entry = n & ((1LL<<32) - 1); + if (hash >= cd->hash_size) + return NULL; + hlist_for_each_entry_rcu(ch, &cd->hash_table[hash], cache_list) if (!entry--) return ch; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 798243eabb1f..964ebc268ee4 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -789,23 +789,33 @@ static int tls_push_record(struct sock *sk, int flags, i = msg_pl->sg.end; sk_msg_iter_var_prev(i); + /* msg_pl->sg.data is a ring; data[MAX+1] is reserved for the wrap + * link (frags won't use it). 'i' is now the last filled entry: + * + * i end start + * v v v [ rsv ] + * [ d ][ d ][ ][ ]...[ ][ d ][ d ][ d ][chain] + * ^ END v + * `-----------------------------------------' + * + * Note that SGL does not allow chain-after-chain, so for TLS 1.3, + * we must make sure we don't create the wrap entry and then chain + * link to content_type immediately at index 0. + */ + if (i < msg_pl->sg.start) + sg_chain(msg_pl->sg.data, ARRAY_SIZE(msg_pl->sg.data), + msg_pl->sg.data); + rec->content_type = record_type; if (prot->version == TLS_1_3_VERSION) { /* Add content type to end of message. No padding added */ sg_set_buf(&rec->sg_content_type, &rec->content_type, 1); sg_mark_end(&rec->sg_content_type); - sg_chain(msg_pl->sg.data, msg_pl->sg.end + 1, - &rec->sg_content_type); + sg_chain(msg_pl->sg.data, i + 2, &rec->sg_content_type); } else { sg_mark_end(sk_msg_elem(msg_pl, i)); } - if (msg_pl->sg.end < msg_pl->sg.start) { - sg_chain(&msg_pl->sg.data[msg_pl->sg.start], - MAX_SKB_FRAGS - msg_pl->sg.start + 1, - msg_pl->sg.data); - } - i = msg_pl->sg.start; sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]); @@ -1356,9 +1366,14 @@ unlock: mutex_unlock(&tls_ctx->tx_lock); } +/* When has_copied is true the caller has already moved bytes to + * userspace. Report sk_err but leave it set so the next read + * surfaces it instead of a spurious EOF, otherwise sk_err is + * consumed via sock_error(). + */ static int tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, - bool released) + bool released, bool has_copied) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); @@ -1376,8 +1391,11 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, if (!sk_psock_queue_empty(psock)) return 0; - if (sk->sk_err) + if (sk->sk_err) { + if (has_copied) + return -READ_ONCE(sk->sk_err); return sock_error(sk); + } if (ret < 0) return ret; @@ -1413,7 +1431,7 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, } if (unlikely(!tls_strp_msg_load(&ctx->strp, released))) - return tls_rx_rec_wait(sk, psock, nonblock, false); + return tls_rx_rec_wait(sk, psock, nonblock, false, has_copied); return 1; } @@ -2100,7 +2118,7 @@ int tls_sw_recvmsg(struct sock *sk, int to_decrypt, chunk; err = tls_rx_rec_wait(sk, psock, flags & MSG_DONTWAIT, - released); + released, !!(decrypted + copied)); if (err <= 0) { if (psock) { chunk = sk_msg_recvmsg(sk, psock, msg, len, @@ -2287,7 +2305,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct tls_decrypt_arg darg; err = tls_rx_rec_wait(sk, NULL, flags & SPLICE_F_NONBLOCK, - true); + true, false); if (err <= 0) goto splice_read_end; @@ -2317,9 +2335,9 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, if (copied < 0) goto splice_requeue; - if (chunk < rxm->full_len) { - rxm->offset += len; - rxm->full_len -= len; + if (copied < rxm->full_len) { + rxm->offset += copied; + rxm->full_len -= copied; goto splice_requeue; } @@ -2373,7 +2391,7 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, } else { struct tls_decrypt_arg darg; - err = tls_rx_rec_wait(sk, NULL, true, released); + err = tls_rx_rec_wait(sk, NULL, true, released, !!copied); if (err <= 0) goto read_sock_end; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e2d787ca3e74..0d9cd977c7b7 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2711,8 +2711,7 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) * Sleep until more data has arrived. But check for races.. */ static long unix_stream_data_wait(struct sock *sk, long timeo, - struct sk_buff *last, unsigned int last_len, - bool freezable) + struct sk_buff *last, bool freezable) { unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; struct sk_buff *tail; @@ -2725,7 +2724,6 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, tail = skb_peek_tail(&sk->sk_receive_queue); if (tail != last || - (tail && tail->len != last_len) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || @@ -2888,7 +2886,7 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) return -EAGAIN; } - WRITE_ONCE(u->inq_len, u->inq_len - skb->len); + WRITE_ONCE(u->inq_len, u->inq_len - unix_skb_len(skb)); #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb == u->oob_skb) { @@ -2921,7 +2919,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, int flags = state->flags; bool check_creds = false; struct scm_cookie scm; - unsigned int last_len; struct unix_sock *u; int copied = 0; int err = 0; @@ -2967,7 +2964,6 @@ redo: goto unlock; } last = skb = skb_peek(&sk->sk_receive_queue); - last_len = last ? last->len : 0; again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) @@ -3001,8 +2997,7 @@ again: mutex_unlock(&u->iolock); - timeo = unix_stream_data_wait(sk, timeo, last, - last_len, freezable); + timeo = unix_stream_data_wait(sk, timeo, last, freezable); if (signal_pending(current)) { err = sock_intr_errno(timeo); @@ -3019,7 +3014,6 @@ unlock: while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); last = skb; - last_len = skb->len; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) goto again; @@ -3069,11 +3063,12 @@ unlock: unix_detach_fds(&scm, skb); } - if (unix_skb_len(skb)) - break; - spin_lock(&sk->sk_receive_queue.lock); - WRITE_ONCE(u->inq_len, u->inq_len - skb->len); + WRITE_ONCE(u->inq_len, u->inq_len - chunk); + if (unix_skb_len(skb)) { + spin_unlock(&sk->sk_receive_queue.lock); + break; + } __skb_unlink(skb, &sk->sk_receive_queue); spin_unlock(&sk->sk_receive_queue.lock); @@ -3094,7 +3089,6 @@ unlock: skip = 0; last = skb; - last_len = skb->len; unix_state_lock(sk); skb = skb_peek_next(skb, &sk->sk_receive_queue); if (skb) @@ -3323,6 +3317,9 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) struct sk_buff *skb; int answ = 0; + if (sk->sk_type != SOCK_STREAM) + return -EOPNOTSUPP; + mutex_lock(&u->iolock); skb = skb_peek(&sk->sk_receive_queue); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index a7967a345827..0783555e2526 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -607,6 +607,8 @@ static void unix_gc(struct work_struct *work) struct sk_buff_head hitlist; struct sk_buff *skb; + WRITE_ONCE(gc_in_progress, true); + spin_lock(&unix_gc_lock); if (unix_graph_state == UNIX_GRAPH_NOT_CYCLIC) { @@ -649,10 +651,8 @@ void unix_schedule_gc(struct user_struct *user) READ_ONCE(user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) return; - if (!READ_ONCE(gc_in_progress)) { - WRITE_ONCE(gc_in_progress, true); + if (!READ_ONCE(gc_in_progress)) queue_work(system_dfl_wq, &unix_gc_work); - } if (user && READ_ONCE(unix_graph_cyclic_sccs)) flush_work(&unix_gc_work); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 44037b066a5f..2ce1063d4a67 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -642,7 +642,7 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) */ sock_reset_flag(sk, SOCK_DONE); sk->sk_state = TCP_CLOSE; - vsk->peer_shutdown = 0; + WRITE_ONCE(vsk->peer_shutdown, 0); } if (sk->sk_type == SOCK_SEQPACKET) { @@ -933,7 +933,7 @@ static struct sock *__vsock_create(struct net *net, vsk->rejected = false; vsk->sent_request = false; vsk->ignore_connecting_rst = false; - vsk->peer_shutdown = 0; + WRITE_ONCE(vsk->peer_shutdown, 0); INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout); INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work); @@ -1241,6 +1241,25 @@ out: return err; } +static __poll_t vsock_poll_shutdown(struct sock *sk, u32 peer_shutdown) +{ + __poll_t mask = 0; + + /* INET sockets treat local write shutdown and peer write shutdown as a + * case of EPOLLHUP set. + */ + if (sk->sk_shutdown == SHUTDOWN_MASK || + ((sk->sk_shutdown & SEND_SHUTDOWN) && + (peer_shutdown & SEND_SHUTDOWN))) + mask |= EPOLLHUP; + + if (sk->sk_shutdown & RCV_SHUTDOWN || + peer_shutdown & SEND_SHUTDOWN) + mask |= EPOLLRDHUP; + + return mask; +} + static __poll_t vsock_poll(struct file *file, struct socket *sock, poll_table *wait) { @@ -1258,24 +1277,17 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, /* Signify that there has been an error on this socket. */ mask |= EPOLLERR; - /* INET sockets treat local write shutdown and peer write shutdown as a - * case of EPOLLHUP set. - */ - if ((sk->sk_shutdown == SHUTDOWN_MASK) || - ((sk->sk_shutdown & SEND_SHUTDOWN) && - (vsk->peer_shutdown & SEND_SHUTDOWN))) { - mask |= EPOLLHUP; - } - - if (sk->sk_shutdown & RCV_SHUTDOWN || - vsk->peer_shutdown & SEND_SHUTDOWN) { - mask |= EPOLLRDHUP; - } - if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; if (sock->type == SOCK_DGRAM) { + u32 peer_shutdown = READ_ONCE(vsk->peer_shutdown); + + /* DGRAM sockets do not take lock_sock() in poll(), so use one + * lockless snapshot for all shutdown-derived mask bits. + */ + mask |= vsock_poll_shutdown(sk, peer_shutdown); + /* For datagram sockets we can read if there is something in * the queue and write as long as the socket isn't shutdown for * sending. @@ -1290,6 +1302,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, } else if (sock_type_connectible(sk->sk_type)) { const struct vsock_transport *transport; + u32 peer_shutdown; lock_sock(sk); @@ -1322,8 +1335,10 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, * terminated should also be considered read, and we check the * shutdown flag for that. */ + peer_shutdown = READ_ONCE(vsk->peer_shutdown); + mask |= vsock_poll_shutdown(sk, peer_shutdown); if (sk->sk_shutdown & RCV_SHUTDOWN || - vsk->peer_shutdown & SEND_SHUTDOWN) { + peer_shutdown & SEND_SHUTDOWN) { mask |= EPOLLIN | EPOLLRDNORM; } diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 7a8963595bf9..b3394946b2ed 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -264,7 +264,7 @@ static void hvs_do_close_lock_held(struct vsock_sock *vsk, struct sock *sk = sk_vsock(vsk); sock_set_flag(sk, SOCK_DONE); - vsk->peer_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(vsk->peer_shutdown, SHUTDOWN_MASK); if (vsock_stream_has_data(vsk) <= 0) sk->sk_state = TCP_CLOSING; sk->sk_state_change(sk); @@ -593,7 +593,9 @@ static int hvs_update_recv_data(struct hvsock *hvs) return -EIO; if (payload_len == 0) - hvs->vsk->peer_shutdown |= SEND_SHUTDOWN; + WRITE_ONCE(hvs->vsk->peer_shutdown, + READ_ONCE(hvs->vsk->peer_shutdown) | + SEND_SHUTDOWN); hvs->recv_data_len = payload_len; hvs->recv_data_off = 0; @@ -736,7 +738,8 @@ static s64 hvs_stream_has_data(struct vsock_sock *vsk) return ret; return hvs->recv_data_len; case 0: - vsk->peer_shutdown |= SEND_SHUTDOWN; + WRITE_ONCE(vsk->peer_shutdown, + READ_ONCE(vsk->peer_shutdown) | SEND_SHUTDOWN); ret = 0; break; default: /* -1 */ diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 416d533f493d..b10666937c49 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -70,34 +70,6 @@ static bool virtio_transport_can_zcopy(const struct virtio_transport *t_ops, return true; } -static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk, - struct sk_buff *skb, - struct msghdr *msg, - size_t pkt_len, - bool zerocopy) -{ - struct ubuf_info *uarg; - - if (msg->msg_ubuf) { - uarg = msg->msg_ubuf; - net_zcopy_get(uarg); - } else { - struct ubuf_info_msgzc *uarg_zc; - - uarg = msg_zerocopy_realloc(sk_vsock(vsk), - pkt_len, NULL, false); - if (!uarg) - return -1; - - uarg_zc = uarg_to_msgzc(uarg); - uarg_zc->zerocopy = zerocopy ? 1 : 0; - } - - skb_zcopy_init(skb, uarg); - - return 0; -} - static int virtio_transport_fill_skb(struct sk_buff *skb, struct virtio_vsock_pkt_info *info, size_t len, @@ -136,27 +108,6 @@ static void virtio_transport_init_hdr(struct sk_buff *skb, hdr->fwd_cnt = cpu_to_le32(0); } -static void virtio_transport_copy_nonlinear_skb(const struct sk_buff *skb, - void *dst, - size_t len) -{ - struct iov_iter iov_iter = { 0 }; - struct kvec kvec; - size_t to_copy; - - kvec.iov_base = dst; - kvec.iov_len = len; - - iov_iter.iter_type = ITER_KVEC; - iov_iter.kvec = &kvec; - iov_iter.nr_segs = 1; - - to_copy = min_t(size_t, len, skb->len); - - skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset, - &iov_iter, to_copy); -} - /* Packet capture */ static struct sk_buff *virtio_transport_build_skb(void *opaque) { @@ -166,12 +117,12 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) struct sk_buff *skb; size_t payload_len; - /* A packet could be split to fit the RX buffer, so we can retrieve - * the payload length from the header and the buffer pointer taking - * care of the offset in the original packet. + /* A packet could be split to fit the RX buffer, so we use + * the payload length from the header, which has been updated + * by the sender to reflect the fragment size. */ pkt_hdr = virtio_vsock_hdr(pkt); - payload_len = pkt->len; + payload_len = le32_to_cpu(pkt_hdr->len); skb = alloc_skb(sizeof(*hdr) + sizeof(*pkt_hdr) + payload_len, GFP_ATOMIC); @@ -214,12 +165,18 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) skb_put_data(skb, pkt_hdr, sizeof(*pkt_hdr)); if (payload_len) { - if (skb_is_nonlinear(pkt)) { - void *data = skb_put(skb, payload_len); - - virtio_transport_copy_nonlinear_skb(pkt, data, payload_len); - } else { - skb_put_data(skb, pkt->data, payload_len); + struct iov_iter iov_iter; + struct kvec kvec; + void *data = skb_put(skb, payload_len); + + kvec.iov_base = data; + kvec.iov_len = payload_len; + iov_iter_kvec(&iov_iter, ITER_DEST, &kvec, 1, payload_len); + + if (skb_copy_datagram_iter(pkt, VIRTIO_VSOCK_SKB_CB(pkt)->offset, + &iov_iter, payload_len)) { + kfree_skb(skb); + return NULL; } } @@ -248,6 +205,7 @@ static u16 virtio_transport_get_type(struct sock *sk) static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info, size_t payload_len, bool zcopy, + struct ubuf_info *uarg, u32 src_cid, u32 src_port, u32 dst_cid, @@ -288,6 +246,12 @@ static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info * if (info->msg && payload_len > 0) { int err; + /* Bind the zerocopy lifetime before filling frags so error + * rollback frees managed fixed-buffer pages through + * the uarg-aware path. + */ + skb_zcopy_set(skb, uarg, NULL); + err = virtio_transport_fill_skb(skb, info, payload_len, zcopy); if (err) goto out; @@ -332,8 +296,10 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, u32 src_cid, src_port, dst_cid, dst_port; const struct virtio_transport *t_ops; struct virtio_vsock_sock *vvs; + struct ubuf_info *uarg = NULL; u32 pkt_len = info->pkt_len; bool can_zcopy = false; + bool have_uref = false; u32 rest_len; int ret; @@ -375,6 +341,25 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, if (can_zcopy) max_skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, (MAX_SKB_FRAGS * PAGE_SIZE)); + + if (info->msg->msg_flags & MSG_ZEROCOPY && + info->op == VIRTIO_VSOCK_OP_RW) { + uarg = info->msg->msg_ubuf; + + if (!uarg) { + uarg = msg_zerocopy_realloc(sk_vsock(vsk), + pkt_len, NULL, false); + if (!uarg) { + virtio_transport_put_credit(vvs, pkt_len); + return -ENOMEM; + } + + if (!can_zcopy) + uarg_to_msgzc(uarg)->zerocopy = 0; + + have_uref = true; + } + } } rest_len = pkt_len; @@ -386,6 +371,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, skb_len = min(max_skb_len, rest_len); skb = virtio_transport_alloc_skb(info, skb_len, can_zcopy, + uarg, src_cid, src_port, dst_cid, dst_port); if (!skb) { @@ -393,28 +379,6 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, break; } - /* We process buffer part by part, allocating skb on - * each iteration. If this is last skb for this buffer - * and MSG_ZEROCOPY mode is in use - we must allocate - * completion for the current syscall. - * - * Pass pkt_len because msg iter is already consumed - * by virtio_transport_fill_skb(), so iter->count - * can not be used for RLIMIT_MEMLOCK pinned-pages - * accounting done by msg_zerocopy_realloc(). - */ - if (info->msg && info->msg->msg_flags & MSG_ZEROCOPY && - skb_len == rest_len && info->op == VIRTIO_VSOCK_OP_RW) { - if (virtio_transport_init_zcopy_skb(vsk, skb, - info->msg, - pkt_len, - can_zcopy)) { - kfree_skb(skb); - ret = -ENOMEM; - break; - } - } - virtio_transport_inc_tx_pkt(vvs, skb); ret = t_ops->send_pkt(skb, info->net); @@ -437,6 +401,18 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, virtio_transport_put_credit(vvs, rest_len); + /* msg_zerocopy_realloc() initializes the ubuf_info refcnt to 1. + * skb_zcopy_set() increases it for each skb, so we can drop that + * initial reference to keep it balanced. + */ + if (have_uref) { + if (rest_len == pkt_len) + /* No data sent, abort the notification. */ + net_zcopy_put_abort(uarg, true); + else + net_zcopy_put(uarg); + } + /* Return number of bytes, if any data has been sent. */ if (rest_len != pkt_len) ret = pkt_len - rest_len; @@ -447,7 +423,16 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, u32 len) { - if (vvs->buf_used + len > vvs->buf_alloc) + u64 skb_overhead = ((u64)skb_queue_len(&vvs->rx_queue) + 1) * SKB_TRUESIZE(0); + + /* Allow at most buf_alloc * 2 total budget (payload + overhead), + * similar to how SO_RCVBUF is doubled to reserve space for sk_buff + * metadata. Check payload against buf_alloc to be sure the other + * peer is respecting the credit, and sk_buff overhead to bound + * queue growth. + */ + if ((u64)vvs->buf_used + len > vvs->buf_alloc || + skb_overhead > vvs->buf_alloc) return false; vvs->rx_bytes += len; @@ -1204,7 +1189,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, if (!t) return -ENOTCONN; - reply = virtio_transport_alloc_skb(&info, 0, false, + reply = virtio_transport_alloc_skb(&info, 0, false, NULL, le64_to_cpu(hdr->dst_cid), le32_to_cpu(hdr->dst_port), le64_to_cpu(hdr->src_cid), @@ -1249,7 +1234,7 @@ static void virtio_transport_do_close(struct vsock_sock *vsk, struct sock *sk = sk_vsock(vsk); sock_set_flag(sk, SOCK_DONE); - vsk->peer_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(vsk->peer_shutdown, SHUTDOWN_MASK); if (vsock_stream_has_data(vsk) <= 0) sk->sk_state = TCP_CLOSING; sk->sk_state_change(sk); @@ -1363,7 +1348,7 @@ destroy: return err; } -static void +static bool virtio_transport_recv_enqueue(struct vsock_sock *vsk, struct sk_buff *skb) { @@ -1378,10 +1363,8 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, spin_lock_bh(&vvs->rx_lock); can_enqueue = virtio_transport_inc_rx_pkt(vvs, len); - if (!can_enqueue) { - free_pkt = true; + if (!can_enqueue) goto out; - } if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) vvs->msg_count++; @@ -1421,6 +1404,8 @@ out: spin_unlock_bh(&vvs->rx_lock); if (free_pkt) kfree_skb(skb); + + return can_enqueue; } static int @@ -1433,7 +1418,17 @@ virtio_transport_recv_connected(struct sock *sk, switch (le16_to_cpu(hdr->op)) { case VIRTIO_VSOCK_OP_RW: - virtio_transport_recv_enqueue(vsk, skb); + if (!virtio_transport_recv_enqueue(vsk, skb)) { + /* There is no more space to queue the packet, so let's + * close the connection; otherwise, we'll lose data. + */ + (void)virtio_transport_reset(vsk, skb); + virtio_transport_do_close(vsk, true); + sk->sk_err = ENOBUFS; + sk_error_report(sk); + vsock_remove_sock(vsk); + break; + } vsock_data_ready(sk); return err; case VIRTIO_VSOCK_OP_CREDIT_REQUEST: @@ -1442,12 +1437,15 @@ virtio_transport_recv_connected(struct sock *sk, case VIRTIO_VSOCK_OP_CREDIT_UPDATE: sk->sk_write_space(sk); break; - case VIRTIO_VSOCK_OP_SHUTDOWN: + case VIRTIO_VSOCK_OP_SHUTDOWN: { + u32 peer_shutdown = READ_ONCE(vsk->peer_shutdown); + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) - vsk->peer_shutdown |= RCV_SHUTDOWN; + peer_shutdown |= RCV_SHUTDOWN; if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) - vsk->peer_shutdown |= SEND_SHUTDOWN; - if (vsk->peer_shutdown == SHUTDOWN_MASK) { + peer_shutdown |= SEND_SHUTDOWN; + WRITE_ONCE(vsk->peer_shutdown, peer_shutdown); + if (peer_shutdown == SHUTDOWN_MASK) { if (vsock_stream_has_data(vsk) <= 0 && !sock_flag(sk, SOCK_DONE)) { (void)virtio_transport_reset(vsk, NULL); virtio_transport_do_close(vsk, true); @@ -1462,6 +1460,7 @@ virtio_transport_recv_connected(struct sock *sk, if (le32_to_cpu(virtio_vsock_hdr(skb)->flags)) sk->sk_state_change(sk); break; + } case VIRTIO_VSOCK_OP_RST: virtio_transport_do_close(vsk, true); break; diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 4296ca1183f1..91516488a742 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -819,7 +819,7 @@ static void vmci_transport_handle_detach(struct sock *sk) /* On a detach the peer will not be sending or receiving * anymore. */ - vsk->peer_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(vsk->peer_shutdown, SHUTDOWN_MASK); /* We should not be sending anymore since the peer won't be * there to receive, but we can still receive if there is data @@ -980,8 +980,10 @@ static int vmci_transport_recv_listen(struct sock *sk, err = -EINVAL; } - if (err < 0) + if (err < 0) { vsock_remove_pending(sk, pending); + sk_acceptq_removed(sk); + } release_sock(pending); vmci_transport_release_pending(pending); @@ -1164,7 +1166,7 @@ vmci_transport_recv_connecting_server(struct sock *listener, /* Close and cleanup the connection. */ vmci_transport_send_reset(pending, pkt); skerr = EPROTO; - err = pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST ? 0 : -EINVAL; + err = -EINVAL; goto destroy; } @@ -1542,7 +1544,9 @@ static int vmci_transport_recv_connected(struct sock *sk, if (pkt->u.mode) { vsk = vsock_sk(sk); - vsk->peer_shutdown |= pkt->u.mode; + WRITE_ONCE(vsk->peer_shutdown, + READ_ONCE(vsk->peer_shutdown) | + pkt->u.mode); sk->sk_state_change(sk); } break; @@ -1559,7 +1563,7 @@ static int vmci_transport_recv_connected(struct sock *sk, * a clean shutdown. */ sock_set_flag(sk, SOCK_DONE); - vsk->peer_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(vsk->peer_shutdown, SHUTDOWN_MASK); if (vsock_stream_has_data(vsk) <= 0) sk->sk_state = TCP_CLOSING; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f334cdef8958..76c537a6e8b5 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1276,6 +1276,18 @@ static int nl80211_prepare_wdev_dump(struct netlink_callback *cb, rtnl_unlock(); return -ENODEV; } + + /* + * The first invocation validated the wdev's netns against + * the caller via __cfg80211_wdev_from_attrs(). The wiphy + * may have moved netns between dumpit invocations (via + * NL80211_CMD_SET_WIPHY_NETNS), so re-check here. + */ + if (!net_eq(wiphy_net(wiphy), sock_net(cb->skb->sk))) { + rtnl_unlock(); + return -ENODEV; + } + *rdev = wiphy_to_rdev(wiphy); *wdev = NULL; @@ -6354,6 +6366,9 @@ nl80211_parse_rnr_elems(struct wiphy *wiphy, struct nlattr *attrs, if (ret) return ERR_PTR(ret); + if (num_elems >= 255) + return ERR_PTR(-EINVAL); + num_elems++; } @@ -6699,6 +6714,12 @@ static int nl80211_calculate_ap_params(struct cfg80211_ap_settings *params) return -EINVAL; } + if (!!params->he_cap != !!params->he_oper) + return -EINVAL; + + if (!!params->eht_cap != !!params->eht_oper) + return -EINVAL; + return 0; } @@ -13867,6 +13888,19 @@ static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(net)) return PTR_ERR(net); + /* + * The caller already has CAP_NET_ADMIN over the source netns + * (enforced by GENL_UNS_ADMIN_PERM on the genl op). Mirror the + * convention used by net/core/rtnetlink.c::rtnl_get_net_ns_capable() + * and require CAP_NET_ADMIN over the target netns as well, so that + * a caller that is privileged in their own user namespace cannot + * push a wiphy into a netns where they have no privilege. + */ + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { + put_net(net); + return -EPERM; + } + err = 0; /* check if anything to do */ @@ -19828,6 +19862,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .cmd = NL80211_CMD_SET_PMK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_pmk, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_CLEAR_SKB), }, @@ -19835,6 +19870,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .cmd = NL80211_CMD_DEL_PMK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_pmk, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index 4c8ea0583f94..d6cd0de64d1f 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -88,7 +88,7 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, out->ftm.ftms_per_burst = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]) out->ftm.ftms_per_burst = - nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]); + nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]); if (capa->ftm.max_ftms_per_burst && (out->ftm.ftms_per_burst > capa->ftm.max_ftms_per_burst || diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 328af43ef832..27a56ee2e8f0 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1071,6 +1071,7 @@ int cfg80211_scan(struct cfg80211_registered_device *rdev) struct cfg80211_scan_request_int *request; struct cfg80211_scan_request_int *rdev_req = rdev->scan_req; u32 n_channels = 0, idx, i; + int err; if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ)) { rdev_req->req.first_part = true; @@ -1100,8 +1101,14 @@ int cfg80211_scan(struct cfg80211_registered_device *rdev) rdev_req->req.scan_6ghz = false; rdev_req->req.first_part = true; + err = rdev_scan(rdev, request); + if (err) { + kfree(request); + return err; + } + rdev->int_scan_req = request; - return rdev_scan(rdev, request); + return 0; } void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, @@ -2462,6 +2469,9 @@ size_t cfg80211_merge_profile(const u8 *ie, size_t ielen, memcpy(merged_ie + copied_len, next_sub->data, next_sub->datalen); copied_len += next_sub->datalen; + + mbssid_elem = next_mbssid; + sub_elem = next_sub; } return copied_len; diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 22d9d9bae8f5..63d145b524c9 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -789,6 +789,8 @@ static int cfg80211_wext_siwfreq(struct net_device *dev, chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); if (!chandef.chan) return -EINVAL; + if (!cfg80211_chandef_valid(&chandef)) + return -EINVAL; return cfg80211_set_monitor_channel(rdev, dev, &chandef); case NL80211_IFTYPE_MESH_POINT: freq = cfg80211_wext_freq(wextfreq); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 887abed25466..f8c8a8c9dfba 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -646,9 +646,42 @@ static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb) return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL); } -static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr) +static struct xsk_addrs *__xsk_addrs_alloc(struct sk_buff *skb, u64 addr) { - skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); + struct xsk_addrs *xsk_addr; + + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); + if (unlikely(!xsk_addr)) + return NULL; + + xsk_addr->addrs[0] = addr; + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; + return xsk_addr; +} + +static struct xsk_addrs *xsk_addrs_alloc(struct sk_buff *skb) +{ + struct xsk_addrs *xsk_addr; + + if (!xsk_skb_destructor_is_addr(skb)) + return (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + + xsk_addr = __xsk_addrs_alloc(skb, xsk_skb_destructor_get_addr(skb)); + if (likely(xsk_addr)) + xsk_addr->num_descs = 1; + return xsk_addr; +} + +static int xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr) +{ + if (IS_ENABLED(CONFIG_64BIT)) { + skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); + return 0; + } + + if (unlikely(!__xsk_addrs_alloc(skb, addr))) + return -ENOMEM; + return 0; } static void xsk_inc_num_desc(struct sk_buff *skb) @@ -685,7 +718,7 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, spin_lock_irqsave(&pool->cq_prod_lock, flags); idx = xskq_get_prod(pool->cq); - if (unlikely(num_descs > 1)) { + if (unlikely(!xsk_skb_destructor_is_addr(skb))) { xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; for (i = 0; i < num_descs; i++) { @@ -724,14 +757,20 @@ void xsk_destruct_skb(struct sk_buff *skb) sock_wfree(skb); } -static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, - u64 addr) +static int xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, + u64 addr) { + int err; + + err = xsk_skb_destructor_set_addr(skb, addr); + if (unlikely(err)) + return err; + skb->dev = xs->dev; skb->priority = READ_ONCE(xs->sk.sk_priority); skb->mark = READ_ONCE(xs->sk.sk_mark); skb->destructor = xsk_destruct_skb; - xsk_skb_destructor_set_addr(skb, addr); + return 0; } static void xsk_consume_skb(struct sk_buff *skb) @@ -740,7 +779,7 @@ static void xsk_consume_skb(struct sk_buff *skb) u32 num_descs = xsk_get_num_desc(skb); struct xsk_addrs *xsk_addr; - if (unlikely(num_descs > 1)) { + if (unlikely(!xsk_skb_destructor_is_addr(skb))) { xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; kmem_cache_free(xsk_tx_generic_cache, xsk_addr); } @@ -763,6 +802,7 @@ static int xsk_skb_metadata(struct sk_buff *skb, void *buffer, u32 hr) { struct xsk_tx_metadata *meta = NULL; + u16 csum_start, csum_offset; if (unlikely(pool->tx_metadata_len == 0)) return -EINVAL; @@ -772,13 +812,15 @@ static int xsk_skb_metadata(struct sk_buff *skb, void *buffer, return -EINVAL; if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { - if (unlikely(meta->request.csum_start + - meta->request.csum_offset + + csum_start = READ_ONCE(meta->request.csum_start); + csum_offset = READ_ONCE(meta->request.csum_offset); + + if (unlikely(csum_start + csum_offset + sizeof(__sum16) > desc->len)) return -EINVAL; - skb->csum_start = hr + meta->request.csum_start; - skb->csum_offset = meta->request.csum_offset; + skb->csum_start = hr + csum_start; + skb->csum_offset = csum_offset; skb->ip_summed = CHECKSUM_PARTIAL; if (unlikely(pool->tx_sw_csum)) { @@ -819,28 +861,19 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, return ERR_PTR(err); skb_reserve(skb, hr); - - xsk_skb_init_misc(skb, xs, desc->addr); if (desc->options & XDP_TX_METADATA) { err = xsk_skb_metadata(skb, buffer, desc, pool, hr); - if (unlikely(err)) + if (unlikely(err)) { + kfree_skb(skb); return ERR_PTR(err); + } } } else { struct xsk_addrs *xsk_addr; - if (xsk_skb_destructor_is_addr(skb)) { - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, - GFP_KERNEL); - if (!xsk_addr) - return ERR_PTR(-ENOMEM); - - xsk_addr->num_descs = 1; - xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); - skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; - } else { - xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; - } + xsk_addr = xsk_addrs_alloc(skb); + if (!xsk_addr) + return ERR_PTR(-ENOMEM); /* in case of -EOVERFLOW that could happen below, * xsk_consume_skb() will release this node as whole skb @@ -856,8 +889,11 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, addr = buffer - pool->addrs; for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { - if (unlikely(i >= MAX_SKB_FRAGS)) + if (unlikely(i >= MAX_SKB_FRAGS)) { + if (!xs->skb) + kfree_skb(skb); return ERR_PTR(-EOVERFLOW); + } page = pool->umem->pgs[addr >> PAGE_SHIFT]; get_page(page); @@ -914,7 +950,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, if (unlikely(err)) goto free_err; - xsk_skb_init_misc(skb, xs, desc->addr); if (desc->options & XDP_TX_METADATA) { err = xsk_skb_metadata(skb, buffer, desc, xs->pool, hr); @@ -927,19 +962,10 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct page *page; u8 *vaddr; - if (xsk_skb_destructor_is_addr(skb)) { - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, - GFP_KERNEL); - if (!xsk_addr) { - err = -ENOMEM; - goto free_err; - } - - xsk_addr->num_descs = 1; - xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); - skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; - } else { - xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + xsk_addr = xsk_addrs_alloc(skb); + if (!xsk_addr) { + err = -ENOMEM; + goto free_err; } if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { @@ -964,18 +990,28 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, } } + if (!xs->skb) { + err = xsk_skb_init_misc(skb, xs, desc->addr); + if (unlikely(err)) + goto free_err; + } xsk_inc_num_desc(skb); return skb; free_err: - if (skb && !skb_shinfo(skb)->nr_frags) + if (skb && !xs->skb) kfree_skb(skb); if (err == -EOVERFLOW) { - /* Drop the packet */ - xsk_inc_num_desc(xs->skb); - xsk_drop_skb(xs->skb); + if (xs->skb) { + /* Drop the packet */ + xsk_inc_num_desc(xs->skb); + xsk_drop_skb(xs->skb); + } else { + xsk_cq_cancel_locked(xs->pool, 1); + xs->tx->invalid_descs++; + } xskq_cons_release(xs->tx); } else { /* Let application retry */ diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index cd7bc50872f6..d981cfdd8535 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -175,6 +175,9 @@ int xp_assign_dev(struct xsk_buff_pool *pool, if (force_zc && force_copy) return -EINVAL; + if (pool->tx_sw_csum && (netdev->priv_flags & IFF_TX_SKB_NO_LINEAR)) + return -EOPNOTSUPP; + if (xsk_get_pool_from_qid(netdev, queue_id)) return -EBUSY; diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index afa457506274..3bff346308d0 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -184,6 +184,10 @@ static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value, } xs = (struct xdp_sock *)sock->sk; + if (!READ_ONCE(xs->rx)) { + sockfd_put(sock); + return -ENOBUFS; + } map_entry = &m->xsk_map[i]; node = xsk_map_node_alloc(m, map_entry); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index f65291eba1f6..e4c2cd24936d 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -797,9 +797,12 @@ static void xfrm_trans_reinject(struct work_struct *work) spin_unlock_bh(&trans->queue_lock); local_bh_disable(); - while ((skb = __skb_dequeue(&queue))) - XFRM_TRANS_SKB_CB(skb)->finish(XFRM_TRANS_SKB_CB(skb)->net, - NULL, skb); + while ((skb = __skb_dequeue(&queue))) { + struct net *net = XFRM_TRANS_SKB_CB(skb)->net; + + XFRM_TRANS_SKB_CB(skb)->finish(net, NULL, skb); + put_net(net); + } local_bh_enable(); } @@ -808,6 +811,7 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, struct sk_buff *)) { struct xfrm_trans_tasklet *trans; + struct net *hold_net; trans = this_cpu_ptr(&xfrm_trans_tasklet); @@ -816,8 +820,12 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb)); + hold_net = maybe_get_net(net); + if (!hold_net) + return -ENODEV; + XFRM_TRANS_SKB_CB(skb)->finish = finish; - XFRM_TRANS_SKB_CB(skb)->net = net; + XFRM_TRANS_SKB_CB(skb)->net = hold_net; spin_lock_bh(&trans->queue_lock); __skb_queue_tail(&trans->queue, skb); spin_unlock_bh(&trans->queue_lock); diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 5f38dff16177..671d48f8c937 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -51,11 +51,15 @@ static int ipcomp_post_acomp(struct sk_buff *skb, int err, int hlen) struct scatterlist *dsg; int len, dlen; - if (unlikely(err)) - goto out_free_req; + if (unlikely(!req)) + return err; extra = acomp_request_extra(req); dsg = extra->sg; + + if (unlikely(err)) + goto out_free_req; + dlen = req->dlen; pskb_trim_unique(skb, 0); @@ -84,10 +88,10 @@ static int ipcomp_post_acomp(struct sk_buff *skb, int err, int hlen) skb_shinfo(skb)->nr_frags++; } while ((dlen -= len)); - for (; dsg; dsg = sg_next(dsg)) +out_free_req: + for (; dsg && sg_page(dsg); dsg = sg_next(dsg)) __free_page(sg_page(dsg)); -out_free_req: acomp_request_free(req); return err; } diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index 97bc979e55ba..6c6bbc040517 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -2650,7 +2650,8 @@ static void __iptfs_init_state(struct xfrm_state *x, x->props.enc_hdr_len = sizeof(struct ip_iptfs_hdr); /* Always keep a module reference when x->mode_data is set */ - __module_get(x->mode_cbs->owner); + if (x->mode_data != xtfs) + __module_get(x->mode_cbs->owner); x->mode_data = xtfs; xtfs->x = x; @@ -2658,22 +2659,39 @@ static void __iptfs_init_state(struct xfrm_state *x, static int iptfs_clone_state(struct xfrm_state *x, struct xfrm_state *orig) { + struct skb_wseq *w_saved = NULL; struct xfrm_iptfs_data *xtfs; xtfs = kmemdup(orig->mode_data, sizeof(*xtfs), GFP_KERNEL); if (!xtfs) return -ENOMEM; - xtfs->ra_newskb = NULL; if (xtfs->cfg.reorder_win_size) { - xtfs->w_saved = kzalloc_objs(*xtfs->w_saved, - xtfs->cfg.reorder_win_size); - if (!xtfs->w_saved) { + w_saved = kzalloc_objs(*w_saved, xtfs->cfg.reorder_win_size); + if (!w_saved) { kfree_sensitive(xtfs); return -ENOMEM; } } + xtfs->w_saved = w_saved; + + __skb_queue_head_init(&xtfs->queue); + xtfs->queue_size = 0; + hrtimer_setup(&xtfs->iptfs_timer, iptfs_delay_timer, CLOCK_MONOTONIC, + IPTFS_HRTIMER_MODE); + + spin_lock_init(&xtfs->drop_lock); + hrtimer_setup(&xtfs->drop_timer, iptfs_drop_timer, CLOCK_MONOTONIC, + IPTFS_HRTIMER_MODE); + xtfs->w_seq_set = false; + xtfs->w_wantseq = 0; + xtfs->w_savedlen = 0; + xtfs->ra_newskb = NULL; + xtfs->ra_wantseq = 0; + xtfs->ra_runtlen = 0; + + __module_get(x->mode_cbs->owner); x->mode_data = xtfs; xtfs->x = x; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index a9652b422f51..cc35c2fcbbe0 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -66,7 +66,9 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *iph = ip_hdr(skb); int ihl = iph->ihl * 4; - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + if (!skb->inner_protocol) + skb_set_inner_transport_header(skb, + skb_transport_offset(skb)); skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + @@ -167,7 +169,9 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) int hdr_len; iph = ipv6_hdr(skb); - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + if (!skb->inner_protocol) + skb_set_inner_transport_header(skb, + skb_transport_offset(skb)); hdr_len = xfrm6_hdr_offset(x, skb, &prevhdr); if (hdr_len < 0) @@ -276,8 +280,10 @@ static int xfrm4_tunnel_encap_add(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *top_iph; int flags; - skb_set_inner_network_header(skb, skb_network_offset(skb)); - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + if (!skb->inner_protocol) { + skb_set_inner_network_header(skb, skb_network_offset(skb)); + skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + } skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + @@ -321,8 +327,10 @@ static int xfrm6_tunnel_encap_add(struct xfrm_state *x, struct sk_buff *skb) struct ipv6hdr *top_iph; int dsfield; - skb_set_inner_network_header(skb, skb_network_offset(skb)); - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + if (!skb->inner_protocol) { + skb_set_inner_network_header(skb, skb_network_offset(skb)); + skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + } skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c944327ce66c..dd09d2063da2 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4276,21 +4276,21 @@ out_byidx: return -ENOMEM; } -static void xfrm_policy_fini(struct net *net) +static void __net_exit xfrm_net_pre_exit(struct net *net) { - struct xfrm_pol_inexact_bin *b, *t; - unsigned int sz; - int dir; - disable_work_sync(&net->xfrm.policy_hthresh.work); - flush_work(&net->xfrm.policy_hash_work); #ifdef CONFIG_XFRM_SUB_POLICY xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false); #endif xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false); +} - synchronize_rcu(); +static void xfrm_policy_fini(struct net *net) +{ + struct xfrm_pol_inexact_bin *b, *t; + unsigned int sz; + int dir; WARN_ON(!list_empty(&net->xfrm.policy_all)); @@ -4368,6 +4368,7 @@ static void __net_exit xfrm_net_exit(struct net *net) static struct pernet_operations __net_initdata xfrm_net_ops = { .init = xfrm_net_init, + .pre_exit = xfrm_net_pre_exit, .exit = xfrm_net_exit, }; @@ -4703,7 +4704,7 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* Stage 5 - announce */ - km_migrate(sel, dir, type, m, num_migrate, k, encap); + km_migrate(sel, dir, type, m, num_migrate, k, net, encap); xfrm_pol_put(pol); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1748d374abca..589c3b6e4679 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -818,17 +818,17 @@ int __xfrm_state_delete(struct xfrm_state *x) spin_lock(&net->xfrm.xfrm_state_lock); list_del(&x->km.all); - hlist_del_rcu(&x->bydst); - hlist_del_rcu(&x->bysrc); - if (x->km.seq) - hlist_del_rcu(&x->byseq); + hlist_del_init_rcu(&x->bydst); + hlist_del_init_rcu(&x->bysrc); + if (!hlist_unhashed(&x->byseq)) + hlist_del_init_rcu(&x->byseq); if (!hlist_unhashed(&x->state_cache)) hlist_del_rcu(&x->state_cache); if (!hlist_unhashed(&x->state_cache_input)) hlist_del_rcu(&x->state_cache_input); - if (x->id.spi) - hlist_del_rcu(&x->byspi); + if (!hlist_unhashed(&x->byspi)) + hlist_del_init_rcu(&x->byspi); net->xfrm.state_num--; xfrm_nat_keepalive_state_updated(x); spin_unlock(&net->xfrm.xfrm_state_lock); @@ -2837,7 +2837,7 @@ EXPORT_SYMBOL(km_policy_expired); #ifdef CONFIG_XFRM_MIGRATE int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_migrate, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap) { int err = -EINVAL; @@ -2848,7 +2848,7 @@ int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->migrate) { ret = km->migrate(sel, dir, type, m, num_migrate, k, - encap); + net, encap); if (!ret) err = ret; } @@ -3114,10 +3114,14 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) const struct xfrm_type *type = READ_ONCE(x->type); struct crypto_aead *aead; u32 blksize, net_adj = 0; + u32 overhead, payload_mtu; if (x->km.state != XFRM_STATE_VALID || - !type || type->proto != IPPROTO_ESP) + !type || type->proto != IPPROTO_ESP) { + if (mtu <= x->props.header_len) + return 1; return mtu - x->props.header_len; + } aead = x->data; blksize = ALIGN(crypto_aead_blocksize(aead), 4); @@ -3140,8 +3144,17 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) break; } - return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - - net_adj) & ~(blksize - 1)) + net_adj - 2; + overhead = x->props.header_len + crypto_aead_authsize(aead) + net_adj; + if (mtu <= overhead) + return 1; + + payload_mtu = mtu - overhead; + payload_mtu &= ~(blksize - 1); + if (payload_mtu <= 2) + return 1; + + return payload_mtu + net_adj - 2; + } EXPORT_SYMBOL_GPL(xfrm_state_mtu); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d56450f61669..71a4b7278eba 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3271,10 +3271,9 @@ out_cancel: static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_migrate, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap) { - struct net *net = &init_net; struct sk_buff *skb; int err; @@ -3292,7 +3291,7 @@ static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, #else static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_migrate, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap) { return -ENOPROTOOPT; @@ -3323,6 +3322,7 @@ const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32), + [XFRM_MSG_MAPPING - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_mapping), [XFRM_MSG_SETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default), [XFRM_MSG_GETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default), }; |
