diff options
Diffstat (limited to 'net')
88 files changed, 1919 insertions, 890 deletions
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 59a5c1341c26..a0f99baafd35 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -571,6 +571,7 @@ int hci_dev_close(__u16 dev) goto done; } + cancel_work_sync(&hdev->power_on); if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) cancel_delayed_work(&hdev->power_off); @@ -2675,6 +2676,8 @@ void hci_unregister_dev(struct hci_dev *hdev) list_del(&hdev->list); write_unlock(&hci_dev_list_lock); + cancel_work_sync(&hdev->power_on); + hci_cmd_sync_clear(hdev); if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 286d6767f017..1739e8cb3291 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4088,7 +4088,6 @@ int hci_dev_close_sync(struct hci_dev *hdev) bt_dev_dbg(hdev, ""); - cancel_work_sync(&hdev->power_on); cancel_delayed_work(&hdev->power_off); cancel_delayed_work(&hdev->ncmd_timer); diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 4fd882686b04..ff4779036649 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1012,9 +1012,24 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, return okfn(net, sk, skb); ops = nf_hook_entries_get_hook_ops(e); - for (i = 0; i < e->num_hook_entries && - ops[i]->priority <= NF_BR_PRI_BRNF; i++) - ; + for (i = 0; i < e->num_hook_entries; i++) { + /* These hooks have already been called */ + if (ops[i]->priority < NF_BR_PRI_BRNF) + continue; + + /* These hooks have not been called yet, run them. */ + if (ops[i]->priority > NF_BR_PRI_BRNF) + break; + + /* take a closer look at NF_BR_PRI_BRNF. */ + if (ops[i]->hook == br_nf_pre_routing) { + /* This hook diverted the skb to this function, + * hooks after this have not been run yet. + */ + i++; + break; + } + } nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); diff --git a/net/can/Kconfig b/net/can/Kconfig index a9ac5ffab286..cb56be8e3862 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -15,7 +15,8 @@ menuconfig CAN PF_CAN is contained in <Documentation/networking/can.rst>. If you want CAN support you should say Y here and also to the - specific driver for your controller(s) below. + specific driver for your controller(s) under the Network device + support section. if CAN @@ -69,6 +70,4 @@ config CAN_ISOTP If you want to perform automotive vehicle diagnostic services (UDS), say 'y'. -source "drivers/net/can/Kconfig" - endif diff --git a/net/can/bcm.c b/net/can/bcm.c index 65ee1b784a30..e60161bec850 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -100,6 +100,7 @@ static inline u64 get_u64(const struct canfd_frame *cp, int offset) struct bcm_op { struct list_head list; + struct rcu_head rcu; int ifindex; canid_t can_id; u32 flags; @@ -718,10 +719,9 @@ static struct bcm_op *bcm_find_op(struct list_head *ops, return NULL; } -static void bcm_remove_op(struct bcm_op *op) +static void bcm_free_op_rcu(struct rcu_head *rcu_head) { - hrtimer_cancel(&op->timer); - hrtimer_cancel(&op->thrtimer); + struct bcm_op *op = container_of(rcu_head, struct bcm_op, rcu); if ((op->frames) && (op->frames != &op->sframe)) kfree(op->frames); @@ -732,6 +732,14 @@ static void bcm_remove_op(struct bcm_op *op) kfree(op); } +static void bcm_remove_op(struct bcm_op *op) +{ + hrtimer_cancel(&op->timer); + hrtimer_cancel(&op->thrtimer); + + call_rcu(&op->rcu, bcm_free_op_rcu); +} + static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op) { if (op->rx_reg_dev == dev) { @@ -757,6 +765,9 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { + /* disable automatic timer on frame reception */ + op->flags |= RX_NO_AUTOTIMER; + /* * Don't care if we're bound or not (due to netdev * problems) can_rx_unregister() is always a save @@ -785,7 +796,6 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, bcm_rx_handler, op); list_del(&op->list); - synchronize_rcu(); bcm_remove_op(op); return 1; /* done */ } diff --git a/net/core/dev.c b/net/core/dev.c index 8958c4227b67..978ed0622d8f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -397,16 +397,18 @@ static void list_netdevice(struct net_device *dev) /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ -static void unlist_netdevice(struct net_device *dev) +static void unlist_netdevice(struct net_device *dev, bool lock) { ASSERT_RTNL(); /* Unlink dev from the device chain */ - write_lock(&dev_base_lock); + if (lock) + write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - write_unlock(&dev_base_lock); + if (lock) + write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -10061,11 +10063,11 @@ int register_netdevice(struct net_device *dev) goto err_uninit; ret = netdev_register_kobject(dev); - if (ret) { - dev->reg_state = NETREG_UNREGISTERED; + write_lock(&dev_base_lock); + dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; + write_unlock(&dev_base_lock); + if (ret) goto err_uninit; - } - dev->reg_state = NETREG_REGISTERED; __netdev_update_features(dev); @@ -10347,7 +10349,9 @@ void netdev_run_todo(void) continue; } + write_lock(&dev_base_lock); dev->reg_state = NETREG_UNREGISTERED; + write_unlock(&dev_base_lock); linkwatch_forget_dev(dev); } @@ -10828,9 +10832,10 @@ void unregister_netdevice_many(struct list_head *head) list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ - unlist_netdevice(dev); - + write_lock(&dev_base_lock); + unlist_netdevice(dev, false); dev->reg_state = NETREG_UNREGISTERING; + write_unlock(&dev_base_lock); } flush_all_backlogs(); @@ -10977,7 +10982,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_close(dev); /* And unlink it from device chain */ - unlist_netdevice(dev); + unlist_netdevice(dev, true); synchronize_net(); diff --git a/net/core/filter.c b/net/core/filter.c index 4fae91984359..4ef77ec5255e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6559,10 +6559,21 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, ifindex, proto, netns_id, flags); if (sk) { - sk = sk_to_full_sk(sk); - if (!sk_fullsock(sk)) { + struct sock *sk2 = sk_to_full_sk(sk); + + /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk + * sock refcnt is decremented to prevent a request_sock leak. + */ + if (!sk_fullsock(sk2)) + sk2 = NULL; + if (sk2 != sk) { sock_gen_put(sk); - return NULL; + /* Ensure there is no need to bump sk2 refcnt */ + if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + return NULL; + } + sk = sk2; } } @@ -6596,10 +6607,21 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, flags); if (sk) { - sk = sk_to_full_sk(sk); - if (!sk_fullsock(sk)) { + struct sock *sk2 = sk_to_full_sk(sk); + + /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk + * sock refcnt is decremented to prevent a request_sock leak. + */ + if (!sk_fullsock(sk2)) + sk2 = NULL; + if (sk2 != sk) { sock_gen_put(sk); - return NULL; + /* Ensure there is no need to bump sk2 refcnt */ + if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + return NULL; + } + sk = sk2; } } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d8ec70622ecb..6a8c2596ebab 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1579,7 +1579,7 @@ static void neigh_managed_work(struct work_struct *work) list_for_each_entry(neigh, &tbl->managed_list, managed_list) neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, - max(NEIGH_VAR(&tbl->parms, DELAY_PROBE_TIME), HZ)); + NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS)); write_unlock_bh(&tbl->lock); } @@ -2100,7 +2100,9 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) nla_put_msecs(skb, NDTPA_PROXY_DELAY, NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_LOCKTIME, - NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD)) + NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_INTERVAL_PROBE_TIME_MS, + NEIGH_VAR(parms, INTERVAL_PROBE_TIME_MS), NDTPA_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); @@ -2255,6 +2257,7 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 }, [NDTPA_PROXY_DELAY] = { .type = NLA_U64 }, [NDTPA_LOCKTIME] = { .type = NLA_U64 }, + [NDTPA_INTERVAL_PROBE_TIME_MS] = { .type = NLA_U64, .min = 1 }, }; static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2373,6 +2376,10 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, nla_get_msecs(tbp[i])); call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); break; + case NDTPA_INTERVAL_PROBE_TIME_MS: + NEIGH_VAR_SET(p, INTERVAL_PROBE_TIME_MS, + nla_get_msecs(tbp[i])); + break; case NDTPA_RETRANS_TIME: NEIGH_VAR_SET(p, RETRANS_TIME, nla_get_msecs(tbp[i])); @@ -3562,6 +3569,22 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, return ret; } +static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp = *ctl; + int ret; + + int min = msecs_to_jiffies(1); + + tmp.extra1 = &min; + tmp.extra2 = NULL; + + ret = proc_dointvec_ms_jiffies_minmax(&tmp, write, buffer, lenp, ppos); + neigh_proc_update(ctl, write); + return ret; +} + int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3658,6 +3681,9 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, #define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) +#define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies_positive) + #define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) @@ -3676,6 +3702,8 @@ static struct neigh_sysctl_table { NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), + NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(INTERVAL_PROBE_TIME_MS, + "interval_probe_time_ms"), NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"), diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index d49fc974e630..d61afd21aab5 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -33,6 +33,7 @@ static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; +/* Caller holds RTNL or dev_base_lock */ static inline int dev_isalive(const struct net_device *dev) { return dev->reg_state <= NETREG_REGISTERED; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index f18e6e771993..b74905fcc3a1 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -389,7 +389,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); - nr_pages = alloc_pages_bulk_array(gfp, bulk, pool->alloc.cache); + nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk, + pool->alloc.cache); if (unlikely(!nr_pages)) return NULL; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 00bf35ee8205..c4a751781581 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -454,8 +454,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->fclone = SKB_FCLONE_ORIG; refcount_set(&fclones->fclone_ref, 1); - - fclones->skb2.fclone = SKB_FCLONE_CLONE; } return skb; @@ -1513,6 +1511,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) refcount_read(&fclones->fclone_ref) == 1) { n = &fclones->skb2; refcount_set(&fclones->fclone_ref, 2); + n->fclone = SKB_FCLONE_CLONE; } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; @@ -3195,9 +3194,7 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) } } - to->truesize += len + plen; - to->len += len + plen; - to->data_len += len + plen; + skb_len_add(to, len + plen); if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { skb_tx_error(from); @@ -3634,13 +3631,8 @@ onlymerged: tgt->ip_summed = CHECKSUM_PARTIAL; skb->ip_summed = CHECKSUM_PARTIAL; - /* Yak, is it really working this way? Some helper please? */ - skb->len -= shiftlen; - skb->data_len -= shiftlen; - skb->truesize -= shiftlen; - tgt->len += shiftlen; - tgt->data_len += shiftlen; - tgt->truesize += shiftlen; + skb_len_add(skb, -shiftlen); + skb_len_add(tgt, shiftlen); return shiftlen; } diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4b297d67edb7..266d3b7b7d0b 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -702,6 +702,11 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) write_lock_bh(&sk->sk_callback_lock); + if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) { + psock = ERR_PTR(-EINVAL); + goto out; + } + if (sk->sk_user_data) { psock = ERR_PTR(-EBUSY); goto out; diff --git a/net/core/sock.c b/net/core/sock.c index 92a0296ccb18..4cb957d934a2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2870,6 +2870,7 @@ void __sk_flush_backlog(struct sock *sk) __release_sock(sk); spin_unlock_bh(&sk->sk_lock.slock); } +EXPORT_SYMBOL_GPL(__sk_flush_backlog); /** * sk_wait_data - wait for data to arrive at sk_receive_queue diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index fbd98ac853ea..7c569bcc0aca 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -94,6 +94,7 @@ struct neigh_table dn_neigh_table = { [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, + [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 0, diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 8cb87b5067ee..3eef72ce99a4 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -87,10 +87,10 @@ config NET_DSA_TAG_MTK Mediatek switches. config NET_DSA_TAG_KSZ - tristate "Tag driver for Microchip 8795/9477/9893 families of switches" + tristate "Tag driver for Microchip 8795/937x/9477/9893 families of switches" help Say Y if you want to enable support for tagging frames for the - Microchip 8795/9477/9893 families of switches. + Microchip 8795/937x/9477/9893 families of switches. config NET_DSA_TAG_OCELOT tristate "Tag driver for Ocelot family of switches, using NPI port" @@ -132,6 +132,13 @@ config NET_DSA_TAG_RTL8_4 Say Y or M if you want to enable support for tagging frames for Realtek switches with 8 byte protocol 4 tags, such as the Realtek RTL8365MB-VC. +config NET_DSA_TAG_RZN1_A5PSW + tristate "Tag driver for Renesas RZ/N1 A5PSW switch" + help + Say Y or M if you want to enable support for tagging frames for + Renesas RZ/N1 embedded switch that uses an 8 byte tag located after + destination MAC address. + config NET_DSA_TAG_LAN9303 tristate "Tag driver for SMSC/Microchip LAN9303 family of switches" help diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 9f75820e7c98..af28c24ead18 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_NET_DSA_TAG_OCELOT_8021Q) += tag_ocelot_8021q.o obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o obj-$(CONFIG_NET_DSA_TAG_RTL4_A) += tag_rtl4_a.o obj-$(CONFIG_NET_DSA_TAG_RTL8_4) += tag_rtl8_4.o +obj-$(CONFIG_NET_DSA_TAG_RZN1_A5PSW) += tag_rzn1_a5psw.o obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o obj-$(CONFIG_NET_DSA_TAG_XRS700X) += tag_xrs700x.o diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2e1ac638d135..ad6a6663feeb 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1002,6 +1002,18 @@ dsa_slave_get_eth_ctrl_stats(struct net_device *dev, ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats); } +static void +dsa_slave_get_rmon_stats(struct net_device *dev, + struct ethtool_rmon_stats *rmon_stats, + const struct ethtool_rmon_hist_range **ranges) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->get_rmon_stats) + ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges); +} + static void dsa_slave_net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) { @@ -1097,6 +1109,16 @@ static int dsa_slave_set_link_ksettings(struct net_device *dev, return phylink_ethtool_ksettings_set(dp->pl, cmd); } +static void dsa_slave_get_pause_stats(struct net_device *dev, + struct ethtool_pause_stats *pause_stats) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->get_pause_stats) + ds->ops->get_pause_stats(ds, dp->index, pause_stats); +} + static void dsa_slave_get_pauseparam(struct net_device *dev, struct ethtool_pauseparam *pause) { @@ -2081,12 +2103,14 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_eth_phy_stats = dsa_slave_get_eth_phy_stats, .get_eth_mac_stats = dsa_slave_get_eth_mac_stats, .get_eth_ctrl_stats = dsa_slave_get_eth_ctrl_stats, + .get_rmon_stats = dsa_slave_get_rmon_stats, .set_wol = dsa_slave_set_wol, .get_wol = dsa_slave_get_wol, .set_eee = dsa_slave_set_eee, .get_eee = dsa_slave_get_eee, .get_link_ksettings = dsa_slave_get_link_ksettings, .set_link_ksettings = dsa_slave_set_link_ksettings, + .get_pause_stats = dsa_slave_get_pause_stats, .get_pauseparam = dsa_slave_get_pauseparam, .set_pauseparam = dsa_slave_set_pauseparam, .get_rxnfc = dsa_slave_get_rxnfc, @@ -2460,8 +2484,9 @@ static int dsa_slave_changeupper(struct net_device *dev, if (!err) dsa_bridge_mtu_normalization(dp); if (err == -EOPNOTSUPP) { - NL_SET_ERR_MSG_MOD(extack, - "Offloading not supported"); + if (!extack->_msg) + NL_SET_ERR_MSG_MOD(extack, + "Offloading not supported"); err = 0; } err = notifier_from_errno(err); diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 3509fc967ca9..38fa19c1e2d5 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -193,10 +193,69 @@ static const struct dsa_device_ops ksz9893_netdev_ops = { DSA_TAG_DRIVER(ksz9893_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893); +/* For xmit, 2 bytes are added before FCS. + * --------------------------------------------------------------------------- + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes) + * --------------------------------------------------------------------------- + * tag0 : represents tag override, lookup and valid + * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x80=port8) + * + * For rcv, 1 byte is added before FCS. + * --------------------------------------------------------------------------- + * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes) + * --------------------------------------------------------------------------- + * tag0 : zero-based value represents port + * (eg, 0x00=port1, 0x02=port3, 0x07=port8) + */ +#define LAN937X_EGRESS_TAG_LEN 2 + +#define LAN937X_TAIL_TAG_BLOCKING_OVERRIDE BIT(11) +#define LAN937X_TAIL_TAG_LOOKUP BIT(12) +#define LAN937X_TAIL_TAG_VALID BIT(13) +#define LAN937X_TAIL_TAG_PORT_MASK 7 + +static struct sk_buff *lan937x_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + const struct ethhdr *hdr = eth_hdr(skb); + __be16 *tag; + u16 val; + + if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) + return NULL; + + tag = skb_put(skb, LAN937X_EGRESS_TAG_LEN); + + val = BIT(dp->index); + + if (is_link_local_ether_addr(hdr->h_dest)) + val |= LAN937X_TAIL_TAG_BLOCKING_OVERRIDE; + + /* Tail tag valid bit - This bit should always be set by the CPU */ + val |= LAN937X_TAIL_TAG_VALID; + + put_unaligned_be16(val, tag); + + return skb; +} + +static const struct dsa_device_ops lan937x_netdev_ops = { + .name = "lan937x", + .proto = DSA_TAG_PROTO_LAN937X, + .xmit = lan937x_xmit, + .rcv = ksz9477_rcv, + .needed_tailroom = LAN937X_EGRESS_TAG_LEN, +}; + +DSA_TAG_DRIVER(lan937x_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN937X); + static struct dsa_tag_driver *dsa_tag_driver_array[] = { &DSA_TAG_DRIVER_NAME(ksz8795_netdev_ops), &DSA_TAG_DRIVER_NAME(ksz9477_netdev_ops), &DSA_TAG_DRIVER_NAME(ksz9893_netdev_ops), + &DSA_TAG_DRIVER_NAME(lan937x_netdev_ops), }; module_dsa_tag_drivers(dsa_tag_driver_array); diff --git a/net/dsa/tag_rzn1_a5psw.c b/net/dsa/tag_rzn1_a5psw.c new file mode 100644 index 000000000000..e2a5ee6ae688 --- /dev/null +++ b/net/dsa/tag_rzn1_a5psw.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 Schneider Electric + * + * Clément Léger <clement.leger@bootlin.com> + */ + +#include <linux/bitfield.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <net/dsa.h> + +#include "dsa_priv.h" + +/* To define the outgoing port and to discover the incoming port a TAG is + * inserted after Src MAC : + * + * Dest MAC Src MAC TAG Type + * ...| 1 2 3 4 5 6 | 1 2 3 4 5 6 | 1 2 3 4 5 6 7 8 | 1 2 |... + * |<--------------->| + * + * See struct a5psw_tag for layout + */ + +#define ETH_P_DSA_A5PSW 0xE001 +#define A5PSW_TAG_LEN 8 +#define A5PSW_CTRL_DATA_FORCE_FORWARD BIT(0) +/* This is both used for xmit tag and rcv tagging */ +#define A5PSW_CTRL_DATA_PORT GENMASK(3, 0) + +struct a5psw_tag { + __be16 ctrl_tag; + __be16 ctrl_data; + __be16 ctrl_data2_hi; + __be16 ctrl_data2_lo; +}; + +static struct sk_buff *a5psw_tag_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct a5psw_tag *ptag; + u32 data2_val; + + BUILD_BUG_ON(sizeof(*ptag) != A5PSW_TAG_LEN); + + /* The Ethernet switch we are interfaced with needs packets to be at + * least 60 bytes otherwise they will be discarded when they enter the + * switch port logic. + */ + if (__skb_put_padto(skb, ETH_ZLEN, false)) + return NULL; + + /* provide 'A5PSW_TAG_LEN' bytes additional space */ + skb_push(skb, A5PSW_TAG_LEN); + + /* make room between MACs and Ether-Type to insert tag */ + dsa_alloc_etype_header(skb, A5PSW_TAG_LEN); + + ptag = dsa_etype_header_pos_tx(skb); + + data2_val = FIELD_PREP(A5PSW_CTRL_DATA_PORT, BIT(dp->index)); + ptag->ctrl_tag = htons(ETH_P_DSA_A5PSW); + ptag->ctrl_data = htons(A5PSW_CTRL_DATA_FORCE_FORWARD); + ptag->ctrl_data2_lo = htons(data2_val); + ptag->ctrl_data2_hi = 0; + + return skb; +} + +static struct sk_buff *a5psw_tag_rcv(struct sk_buff *skb, + struct net_device *dev) +{ + struct a5psw_tag *tag; + int port; + + if (unlikely(!pskb_may_pull(skb, A5PSW_TAG_LEN))) { + dev_warn_ratelimited(&dev->dev, + "Dropping packet, cannot pull\n"); + return NULL; + } + + tag = dsa_etype_header_pos_rx(skb); + + if (tag->ctrl_tag != htons(ETH_P_DSA_A5PSW)) { + dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid TAG marker\n"); + return NULL; + } + + port = FIELD_GET(A5PSW_CTRL_DATA_PORT, ntohs(tag->ctrl_data)); + + skb->dev = dsa_master_find_slave(dev, 0, port); + if (!skb->dev) + return NULL; + + skb_pull_rcsum(skb, A5PSW_TAG_LEN); + dsa_strip_etype_header(skb, A5PSW_TAG_LEN); + + dsa_default_offload_fwd_mark(skb); + + return skb; +} + +static const struct dsa_device_ops a5psw_netdev_ops = { + .name = "a5psw", + .proto = DSA_TAG_PROTO_RZN1_A5PSW, + .xmit = a5psw_tag_xmit, + .rcv = a5psw_tag_rcv, + .needed_headroom = A5PSW_TAG_LEN, +}; + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_A5PSW); +module_dsa_tag_driver(a5psw_netdev_ops); diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c index 7e6b37a54add..1c94bb8ea03f 100644 --- a/net/ethtool/eeprom.c +++ b/net/ethtool/eeprom.c @@ -36,7 +36,7 @@ static int fallback_set_params(struct eeprom_req_info *request, if (request->page) offset = request->page * ETH_MODULE_EEPROM_PAGE_LEN + offset; - if (modinfo->type == ETH_MODULE_SFF_8079 && + if (modinfo->type == ETH_MODULE_SFF_8472 && request->i2c_address == 0x51) offset += ETH_MODULE_EEPROM_PAGE_LEN * 2; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index ab4a5601c82a..af2f12ffc9ca 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -168,6 +168,7 @@ struct neigh_table arp_tbl = { [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, + [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b21238df3301..7eae8d686e20 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -502,9 +502,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * nfrags++; - skb->len += tailen; - skb->data_len += tailen; - skb->truesize += tailen; + skb_len_add(skb, tailen); if (sk && sk_fullsock(sk)) refcount_add(tailen, &sk->sk_wmem_alloc); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 3b9cd487075a..5c58e21f724e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -524,7 +524,6 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) int tunnel_hlen; int version; int nhoff; - int thoff; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || @@ -558,10 +557,16 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) truncate = true; - thoff = skb_transport_header(skb) - skb_mac_header(skb); - if (skb->protocol == htons(ETH_P_IPV6) && - (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)) - truncate = true; + if (skb->protocol == htons(ETH_P_IPV6)) { + int thoff; + + if (skb_transport_header_was_set(skb)) + thoff = skb_transport_header(skb) - skb_mac_header(skb); + else + thoff = nhoff + sizeof(struct ipv6hdr); + if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff) + truncate = true; + } if (version == 1) { erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)), diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 00b4bf26fd93..5e32a2f86fbd 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1214,9 +1214,7 @@ alloc_new_skb: pfrag->offset += copy; skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); - skb->len += copy; - skb->data_len += copy; - skb->truesize += copy; + skb_len_add(skb, copy); wmem_alloc_delta += copy; } else { err = skb_zerocopy_iter_dgram(skb, from, copy); @@ -1443,9 +1441,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, skb->csum = csum_block_add(skb->csum, csum, skb->len); } - skb->len += len; - skb->data_len += len; - skb->truesize += len; + skb_len_add(skb, len); refcount_add(len, &sk->sk_wmem_alloc); offset += len; size -= len; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 6b2dc7b2b612..cc1caab4a654 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -410,7 +410,7 @@ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, u32 mtu = dst_mtu(encap_dst) - headroom; if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) || - (!skb_is_gso(skb) && (skb->len - skb_mac_header_len(skb)) <= mtu)) + (!skb_is_gso(skb) && (skb->len - skb_network_offset(skb)) <= mtu)) return 0; skb_dst_update_pmtu_no_confirm(skb, mtu); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 9d41d5d5cd1e..f53a0f2453af 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1759,15 +1759,15 @@ static int __init ip_auto_config_setup(char *addrs) case 4: if ((dp = strchr(ip, '.'))) { *dp++ = '\0'; - strlcpy(utsname()->domainname, dp, + strscpy(utsname()->domainname, dp, sizeof(utsname()->domainname)); } - strlcpy(utsname()->nodename, ip, + strscpy(utsname()->nodename, ip, sizeof(utsname()->nodename)); ic_host_name_set = 1; break; case 5: - strlcpy(user_dev_name, ip, sizeof(user_dev_name)); + strscpy(user_dev_name, ip, sizeof(user_dev_name)); break; case 6: if (ic_proto_name(ip) == 0 && @@ -1814,7 +1814,7 @@ __setup("nfsaddrs=", nfsaddrs_config_setup); static int __init vendor_class_identifier_setup(char *addrs) { - if (strlcpy(vendor_class_identifier, addrs, + if (strscpy(vendor_class_identifier, addrs, sizeof(vendor_class_identifier)) >= sizeof(vendor_class_identifier)) pr_warn("DHCP: vendorclass too long, truncated to \"%s\"\n", diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 8324e541d193..73651d17e51f 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -77,7 +77,12 @@ struct ipmr_result { * Note that the changes are semaphored via rtnl_lock. */ -static DEFINE_RWLOCK(mrt_lock); +static DEFINE_SPINLOCK(mrt_lock); + +static struct net_device *vif_dev_read(const struct vif_device *vif) +{ + return rcu_dereference(vif->dev); +} /* Multicast router control variables */ @@ -100,11 +105,11 @@ static void ipmr_free_table(struct mr_table *mrt); static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc_cache *cache, int local); -static int ipmr_cache_report(struct mr_table *mrt, +static int ipmr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); -static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); +static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt); static void mroute_clean_tables(struct mr_table *mrt, int flags); static void ipmr_expire_process(struct timer_list *t); @@ -501,11 +506,15 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) return err; } - read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; dev->stats.tx_packets++; - ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT); - read_unlock(&mrt_lock); + rcu_read_lock(); + + /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */ + ipmr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num), + IGMPMSG_WHOLEPKT); + + rcu_read_unlock(); kfree_skb(skb); return NETDEV_TX_OK; } @@ -572,6 +581,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, { struct net_device *reg_dev = NULL; struct iphdr *encap; + int vif_num; encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); /* Check that: @@ -584,11 +594,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, ntohs(encap->tot_len) + pimlen > skb->len) return 1; - read_lock(&mrt_lock); - if (mrt->mroute_reg_vif_num >= 0) - reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; - read_unlock(&mrt_lock); - + /* Pairs with WRITE_ONCE() in vif_add()/vid_delete() */ + vif_num = READ_ONCE(mrt->mroute_reg_vif_num); + if (vif_num >= 0) + reg_dev = vif_dev_read(&mrt->vif_table[vif_num]); if (!reg_dev) return 1; @@ -614,10 +623,11 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) static int call_ipmr_vif_entry_notifiers(struct net *net, enum fib_event_type event_type, struct vif_device *vif, + struct net_device *vif_dev, vifi_t vif_index, u32 tb_id) { return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type, - vif, vif_index, tb_id, + vif, vif_dev, vif_index, tb_id, &net->ipv4.ipmr_seq); } @@ -649,22 +659,19 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, v = &mrt->vif_table[vifi]; - if (VIF_EXISTS(mrt, vifi)) - call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi, - mrt->id); - - write_lock_bh(&mrt_lock); - dev = v->dev; - v->dev = NULL; - - if (!dev) { - write_unlock_bh(&mrt_lock); + dev = rtnl_dereference(v->dev); + if (!dev) return -EADDRNOTAVAIL; - } - if (vifi == mrt->mroute_reg_vif_num) - mrt->mroute_reg_vif_num = -1; + spin_lock(&mrt_lock); + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, dev, + vifi, mrt->id); + RCU_INIT_POINTER(v->dev, NULL); + if (vifi == mrt->mroute_reg_vif_num) { + /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */ + WRITE_ONCE(mrt->mroute_reg_vif_num, -1); + } if (vifi + 1 == mrt->maxvif) { int tmp; @@ -672,10 +679,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, if (VIF_EXISTS(mrt, tmp)) break; } - mrt->maxvif = tmp+1; + WRITE_ONCE(mrt->maxvif, tmp + 1); } - write_unlock_bh(&mrt_lock); + spin_unlock(&mrt_lock); dev_set_allmulti(dev, -1); @@ -777,7 +784,7 @@ out: spin_unlock(&mfc_unres_lock); } -/* Fill oifs list. It is called under write locked mrt_lock. */ +/* Fill oifs list. It is called under locked mrt_lock. */ static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, unsigned char *ttls) { @@ -889,15 +896,18 @@ static int vif_add(struct net *net, struct mr_table *mrt, v->remote = vifc->vifc_rmt_addr.s_addr; /* And finish update writing critical data */ - write_lock_bh(&mrt_lock); - v->dev = dev; + spin_lock(&mrt_lock); + rcu_assign_pointer(v->dev, dev); netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC); - if (v->flags & VIFF_REGISTER) - mrt->mroute_reg_vif_num = vifi; + if (v->flags & VIFF_REGISTER) { + /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */ + WRITE_ONCE(mrt->mroute_reg_vif_num, vifi); + } if (vifi+1 > mrt->maxvif) - mrt->maxvif = vifi+1; - write_unlock_bh(&mrt_lock); - call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id); + WRITE_ONCE(mrt->maxvif, vifi + 1); + spin_unlock(&mrt_lock); + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev, + vifi, mrt->id); return 0; } @@ -1001,9 +1011,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, /* Bounce a cache query up to mrouted and netlink. * - * Called under mrt_lock. + * Called under rcu_read_lock(). */ -static int ipmr_cache_report(struct mr_table *mrt, +static int ipmr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert) { const int ihl = ip_hdrlen(pkt); @@ -1038,8 +1048,11 @@ static int ipmr_cache_report(struct mr_table *mrt, msg->im_vif = vifi; msg->im_vif_hi = vifi >> 8; } else { - msg->im_vif = mrt->mroute_reg_vif_num; - msg->im_vif_hi = mrt->mroute_reg_vif_num >> 8; + /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */ + int vif_num = READ_ONCE(mrt->mroute_reg_vif_num); + + msg->im_vif = vif_num; + msg->im_vif_hi = vif_num >> 8; } ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + @@ -1064,10 +1077,8 @@ static int ipmr_cache_report(struct mr_table *mrt, skb->transport_header = skb->network_header; } - rcu_read_lock(); mroute_sk = rcu_dereference(mrt->mroute_sk); if (!mroute_sk) { - rcu_read_unlock(); kfree_skb(skb); return -EINVAL; } @@ -1076,7 +1087,7 @@ static int ipmr_cache_report(struct mr_table *mrt, /* Deliver to mrouted */ ret = sock_queue_rcv_skb(mroute_sk, skb); - rcu_read_unlock(); + if (ret < 0) { net_warn_ratelimited("mroute: pending queue full, dropping entries\n"); kfree_skb(skb); @@ -1086,6 +1097,7 @@ static int ipmr_cache_report(struct mr_table *mrt, } /* Queue a packet for resolution. It gets locked cache entry! */ +/* Called under rcu_read_lock() */ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb, struct net_device *dev) { @@ -1198,12 +1210,12 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, mfc->mfcc_mcastgrp.s_addr, parent); rcu_read_unlock(); if (c) { - write_lock_bh(&mrt_lock); + spin_lock(&mrt_lock); c->_c.mfc_parent = mfc->mfcc_parent; ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; - write_unlock_bh(&mrt_lock); + spin_unlock(&mrt_lock); call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); @@ -1598,20 +1610,20 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) if (vr.vifi >= mrt->maxvif) return -EINVAL; vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif); - read_lock(&mrt_lock); + rcu_read_lock(); vif = &mrt->vif_table[vr.vifi]; if (VIF_EXISTS(mrt, vr.vifi)) { - vr.icount = vif->pkt_in; - vr.ocount = vif->pkt_out; - vr.ibytes = vif->bytes_in; - vr.obytes = vif->bytes_out; - read_unlock(&mrt_lock); + vr.icount = READ_ONCE(vif->pkt_in); + vr.ocount = READ_ONCE(vif->pkt_out); + vr.ibytes = READ_ONCE(vif->bytes_in); + vr.obytes = READ_ONCE(vif->bytes_out); + rcu_read_unlock(); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } - read_unlock(&mrt_lock); + rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT: if (copy_from_user(&sr, arg, sizeof(sr))) @@ -1673,20 +1685,20 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) if (vr.vifi >= mrt->maxvif) return -EINVAL; vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif); - read_lock(&mrt_lock); + rcu_read_lock(); vif = &mrt->vif_table[vr.vifi]; if (VIF_EXISTS(mrt, vr.vifi)) { - vr.icount = vif->pkt_in; - vr.ocount = vif->pkt_out; - vr.ibytes = vif->bytes_in; - vr.obytes = vif->bytes_out; - read_unlock(&mrt_lock); + vr.icount = READ_ONCE(vif->pkt_in); + vr.ocount = READ_ONCE(vif->pkt_out); + vr.ibytes = READ_ONCE(vif->bytes_in); + vr.obytes = READ_ONCE(vif->bytes_out); + rcu_read_unlock(); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } - read_unlock(&mrt_lock); + rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT: if (copy_from_user(&sr, arg, sizeof(sr))) @@ -1726,7 +1738,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v ipmr_for_each_table(mrt, net) { v = &mrt->vif_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { - if (v->dev == dev) + if (rcu_access_pointer(v->dev) == dev) vif_delete(mrt, ct, 1, NULL); } } @@ -1804,26 +1816,28 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, } #endif -/* Processing handlers for ipmr_forward */ +/* Processing handlers for ipmr_forward, under rcu_read_lock() */ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, int in_vifi, struct sk_buff *skb, int vifi) { const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; + struct net_device *vif_dev; struct net_device *dev; struct rtable *rt; struct flowi4 fl4; int encap = 0; - if (!vif->dev) + vif_dev = vif_dev_read(vif); + if (!vif_dev) goto out_free; if (vif->flags & VIFF_REGISTER) { - vif->pkt_out++; - vif->bytes_out += skb->len; - vif->dev->stats.tx_bytes += skb->len; - vif->dev->stats.tx_packets++; + WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); + WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); + vif_dev->stats.tx_bytes += skb->len; + vif_dev->stats.tx_packets++; ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT); goto out_free; } @@ -1868,8 +1882,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; } - vif->pkt_out++; - vif->bytes_out += skb->len; + WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); + WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); @@ -1881,8 +1895,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, if (vif->flags & VIFF_TUNNEL) { ip_encap(net, skb, vif->local, vif->remote); /* FIXME: extra output firewall step used to be here. --RR */ - vif->dev->stats.tx_packets++; - vif->dev->stats.tx_bytes += skb->len; + vif_dev->stats.tx_packets++; + vif_dev->stats.tx_bytes += skb->len; } IPCB(skb)->flags |= IPSKB_FORWARDED; @@ -1906,18 +1920,20 @@ out_free: kfree_skb(skb); } -static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev) +/* Called with mrt_lock or rcu_read_lock() */ +static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev) { int ct; - - for (ct = mrt->maxvif-1; ct >= 0; ct--) { - if (mrt->vif_table[ct].dev == dev) + /* Pairs with WRITE_ONCE() in vif_delete()/vif_add() */ + for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) { + if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev) break; } return ct; } /* "local" means that we should preserve one skb (for local delivery) */ +/* Called uner rcu_read_lock() */ static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc_cache *c, int local) @@ -1944,7 +1960,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, } /* Wrong interface: drop packet and (maybe) send PIM assert. */ - if (mrt->vif_table[vif].dev != dev) { + if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { if (rt_is_output_route(skb_rtable(skb))) { /* It is our own packet, looped back. * Very complicated situation... @@ -1983,8 +1999,10 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, } forward: - mrt->vif_table[vif].pkt_in++; - mrt->vif_table[vif].bytes_in += skb->len; + WRITE_ONCE(mrt->vif_table[vif].pkt_in, + mrt->vif_table[vif].pkt_in + 1); + WRITE_ONCE(mrt->vif_table[vif].bytes_in, + mrt->vif_table[vif].bytes_in + skb->len); /* Forward the frame */ if (c->mfc_origin == htonl(INADDR_ANY) && @@ -2140,22 +2158,14 @@ int ip_mr_input(struct sk_buff *skb) skb = skb2; } - read_lock(&mrt_lock); vif = ipmr_find_vif(mrt, dev); - if (vif >= 0) { - int err2 = ipmr_cache_unresolved(mrt, vif, skb, dev); - read_unlock(&mrt_lock); - - return err2; - } - read_unlock(&mrt_lock); + if (vif >= 0) + return ipmr_cache_unresolved(mrt, vif, skb, dev); kfree_skb(skb); return -ENODEV; } - read_lock(&mrt_lock); ip_mr_forward(net, mrt, dev, skb, cache, local); - read_unlock(&mrt_lock); if (local) return ip_local_deliver(skb); @@ -2252,18 +2262,15 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, int vif = -1; dev = skb->dev; - read_lock(&mrt_lock); if (dev) vif = ipmr_find_vif(mrt, dev); if (vif < 0) { - read_unlock(&mrt_lock); rcu_read_unlock(); return -ENODEV; } skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr)); if (!skb2) { - read_unlock(&mrt_lock); rcu_read_unlock(); return -ENOMEM; } @@ -2277,14 +2284,11 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, iph->daddr = daddr; iph->version = 0; err = ipmr_cache_unresolved(mrt, vif, skb2, dev); - read_unlock(&mrt_lock); rcu_read_unlock(); return err; } - read_lock(&mrt_lock); err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); - read_unlock(&mrt_lock); rcu_read_unlock(); return err; } @@ -2404,7 +2408,7 @@ static size_t igmpmsg_netlink_msgsize(size_t payloadlen) return len; } -static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt) +static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt) { struct net *net = read_pnet(&mrt->net); struct nlmsghdr *nlh; @@ -2744,18 +2748,21 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb) { + struct net_device *vif_dev; struct nlattr *vif_nest; struct vif_device *vif; + vif = &mrt->vif_table[vifid]; + vif_dev = rtnl_dereference(vif->dev); /* if the VIF doesn't exist just continue */ - if (!VIF_EXISTS(mrt, vifid)) + if (!vif_dev) return true; - vif = &mrt->vif_table[vifid]; vif_nest = nla_nest_start_noflag(skb, IPMRA_VIF); if (!vif_nest) return false; - if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif->dev->ifindex) || + + if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif_dev->ifindex) || nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) || nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) || nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in, @@ -2887,7 +2894,7 @@ out: */ static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(mrt_lock) + __acquires(RCU) { struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); @@ -2899,14 +2906,14 @@ static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) iter->mrt = mrt; - read_lock(&mrt_lock); + rcu_read_lock(); return mr_vif_seq_start(seq, pos); } static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) - __releases(mrt_lock) + __releases(RCU) { - read_unlock(&mrt_lock); + rcu_read_unlock(); } static int ipmr_vif_seq_show(struct seq_file *seq, void *v) @@ -2919,9 +2926,11 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); } else { const struct vif_device *vif = v; - const char *name = vif->dev ? - vif->dev->name : "none"; + const struct net_device *vif_dev; + const char *name; + vif_dev = vif_dev_read(vif); + name = vif_dev ? vif_dev->name : "none"; seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", vif - mrt->vif_table, @@ -3017,7 +3026,7 @@ static int ipmr_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump, - ipmr_mr_table_iter, &mrt_lock, extack); + ipmr_mr_table_iter, extack); } static const struct fib_notifier_ops ipmr_notifier_ops_template = { diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index aa8738a91210..271dc03fc6db 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -13,7 +13,7 @@ void vif_device_init(struct vif_device *v, unsigned short flags, unsigned short get_iflink_mask) { - v->dev = NULL; + RCU_INIT_POINTER(v->dev, NULL); v->bytes_in = 0; v->bytes_out = 0; v->pkt_in = 0; @@ -208,6 +208,7 @@ EXPORT_SYMBOL(mr_mfc_seq_next); int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mr_mfc *c, struct rtmsg *rtm) { + struct net_device *vif_dev; struct rta_mfc_stats mfcs; struct nlattr *mp_attr; struct rtnexthop *nhp; @@ -220,10 +221,13 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, return -ENOENT; } - if (VIF_EXISTS(mrt, c->mfc_parent) && - nla_put_u32(skb, RTA_IIF, - mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) + rcu_read_lock(); + vif_dev = rcu_dereference(mrt->vif_table[c->mfc_parent].dev); + if (vif_dev && nla_put_u32(skb, RTA_IIF, vif_dev->ifindex) < 0) { + rcu_read_unlock(); return -EMSGSIZE; + } + rcu_read_unlock(); if (c->mfc_flags & MFC_OFFLOAD) rtm->rtm_flags |= RTNH_F_OFFLOAD; @@ -232,23 +236,27 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, if (!mp_attr) return -EMSGSIZE; + rcu_read_lock(); for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { - if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { - struct vif_device *vif; + struct vif_device *vif = &mrt->vif_table[ct]; + + vif_dev = rcu_dereference(vif->dev); + if (vif_dev && c->mfc_un.res.ttls[ct] < 255) { nhp = nla_reserve_nohdr(skb, sizeof(*nhp)); if (!nhp) { + rcu_read_unlock(); nla_nest_cancel(skb, mp_attr); return -EMSGSIZE; } nhp->rtnh_flags = 0; nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; - vif = &mrt->vif_table[ct]; - nhp->rtnh_ifindex = vif->dev->ifindex; + nhp->rtnh_ifindex = vif_dev->ifindex; nhp->rtnh_len = sizeof(*nhp); } } + rcu_read_unlock(); nla_nest_end(skb, mp_attr); @@ -275,13 +283,14 @@ static bool mr_mfc_uses_dev(const struct mr_table *mrt, int ct; for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { - if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { - const struct vif_device *vif; - - vif = &mrt->vif_table[ct]; - if (vif->dev == dev) - return true; - } + const struct net_device *vif_dev; + const struct vif_device *vif; + + vif = &mrt->vif_table[ct]; + vif_dev = rcu_access_pointer(vif->dev); + if (vif_dev && c->mfc_un.res.ttls[ct] < 255 && + vif_dev == dev) + return true; } return false; } @@ -390,7 +399,6 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), - rwlock_t *mrt_lock, struct netlink_ext_ack *extack) { struct mr_table *mrt; @@ -402,22 +410,25 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, for (mrt = mr_iter(net, NULL); mrt; mrt = mr_iter(net, mrt)) { struct vif_device *v = &mrt->vif_table[0]; + struct net_device *vif_dev; struct mr_mfc *mfc; int vifi; /* Notifiy on table VIF entries */ - read_lock(mrt_lock); + rcu_read_lock(); for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { - if (!v->dev) + vif_dev = rcu_dereference(v->dev); + if (!vif_dev) continue; err = mr_call_vif_notifier(nb, family, - FIB_EVENT_VIF_ADD, - v, vifi, mrt->id, extack); + FIB_EVENT_VIF_ADD, v, + vif_dev, vifi, + mrt->id, extack); if (err) break; } - read_unlock(mrt_lock); + rcu_read_unlock(); if (err) return err; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 3d6fc6d841b8..b83c2bd9d722 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -316,12 +316,16 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); + if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) + return 0; + tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); - if (!inet_addr_valid_or_nonlocal(net, inet_sk(sk), - addr->sin_addr.s_addr, - chk_addr_ret)) + if (chk_addr_ret == RTN_MULTICAST || + chk_addr_ret == RTN_BROADCAST || + (chk_addr_ret != RTN_LOCAL && + !inet_can_nonlocal_bind(net, isk))) return -EADDRNOTAVAIL; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 959bea12dc48..006c1f0ed8b4 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -95,10 +95,10 @@ int raw_hash_sk(struct sock *sk) hlist = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)]; - write_lock_bh(&h->lock); + spin_lock(&h->lock); __sk_nulls_add_node_rcu(sk, hlist); sock_set_flag(sk, SOCK_RCU_FREE); - write_unlock_bh(&h->lock); + spin_unlock(&h->lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; @@ -109,10 +109,10 @@ void raw_unhash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; - write_lock_bh(&h->lock); + spin_lock(&h->lock); if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - write_unlock_bh(&h->lock); + spin_unlock(&h->lock); } EXPORT_SYMBOL_GPL(raw_unhash_sk); @@ -278,7 +278,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) sk_nulls_for_each(sk, hnode, hlist) { iph = (const struct iphdr *)skb->data; if (!raw_v4_match(net, sk, iph->protocol, - iph->saddr, iph->daddr, dif, sdif)) + iph->daddr, iph->saddr, dif, sdif)) continue; raw_err(sk, skb, info); } diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index ac4b6525d3c6..999321834b94 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -156,7 +156,7 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, s_slot = cb->args[0]; num = s_num = cb->args[1]; - read_lock(&hashinfo->lock); + rcu_read_lock(); for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) { num = 0; @@ -184,7 +184,7 @@ next: } out_unlock: - read_unlock(&hashinfo->lock); + rcu_read_unlock(); cb->args[0] = slot; cb->args[1] = num; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9d2fd3ced21b..21bdee88383b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4575,16 +4575,24 @@ EXPORT_SYMBOL_GPL(tcp_done); int tcp_abort(struct sock *sk, int err) { - if (!sk_fullsock(sk)) { - if (sk->sk_state == TCP_NEW_SYN_RECV) { - struct request_sock *req = inet_reqsk(sk); + int state = inet_sk_state_load(sk); - local_bh_disable(); - inet_csk_reqsk_queue_drop(req->rsk_listener, req); - local_bh_enable(); - return 0; - } - return -EOPNOTSUPP; + if (state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + + local_bh_disable(); + inet_csk_reqsk_queue_drop(req->rsk_listener, req); + local_bh_enable(); + return 0; + } + if (state == TCP_TIME_WAIT) { + struct inet_timewait_sock *tw = inet_twsk(sk); + + refcount_inc(&tw->tw_refcnt); + local_bh_disable(); + inet_twsk_deschedule_put(tw); + local_bh_enable(); + return 0; } /* Don't race with userspace socket closes such as tcp_close. */ diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 38550bb1b90b..a1626afe87a1 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -612,9 +612,6 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) return 0; } - if (inet_csk_has_ulp(sk)) - return -EINVAL; - if (sk->sk_family == AF_INET6) { if (tcp_bpf_assert_proto_ops(psock->sk_proto)) return -EINVAL; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fda811a5251f..68d0d8a008e2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1964,7 +1964,10 @@ process: struct sock *nsk; sk = req->rsk_listener; - drop_reason = tcp_inbound_md5_hash(sk, skb, + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) + drop_reason = SKB_DROP_REASON_XFRM_POLICY; + else + drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr, &iph->daddr, AF_INET, dif, sdif); if (unlikely(drop_reason)) { @@ -2016,6 +2019,7 @@ process: } goto discard_and_relse; } + nf_reset_ct(skb); if (nsk == sk) { reqsk_put(req); tcp_v4_restore_cb(skb); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3497ad1362c0..88becb037eb6 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1109,10 +1109,6 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, goto out; } - if (net->ipv6.devconf_all->disable_policy || - idev->cnf.disable_policy) - f6i->dst_nopolicy = true; - neigh_parms_data_state_setall(idev->nd_parms); ifa->addr = *cfg->pfx; @@ -4524,6 +4520,39 @@ restart: /* We try to batch several events at once. */ age = (now - ifp->tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; + if ((ifp->flags&IFA_F_TEMPORARY) && + !(ifp->flags&IFA_F_TENTATIVE) && + ifp->prefered_lft != INFINITY_LIFE_TIME && + !ifp->regen_count && ifp->ifpub) { + /* This is a non-regenerated temporary addr. */ + + unsigned long regen_advance = ifp->idev->cnf.regen_max_retry * + ifp->idev->cnf.dad_transmits * + max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; + + if (age + regen_advance >= ifp->prefered_lft) { + struct inet6_ifaddr *ifpub = ifp->ifpub; + if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) + next = ifp->tstamp + ifp->prefered_lft * HZ; + + ifp->regen_count++; + in6_ifa_hold(ifp); + in6_ifa_hold(ifpub); + spin_unlock(&ifp->lock); + + spin_lock(&ifpub->lock); + ifpub->regen_count = 0; + spin_unlock(&ifpub->lock); + rcu_read_unlock_bh(); + ipv6_create_tempaddr(ifpub, true); + in6_ifa_put(ifpub); + in6_ifa_put(ifp); + rcu_read_lock_bh(); + goto restart; + } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) + next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ; + } + if (ifp->valid_lft != INFINITY_LIFE_TIME && age >= ifp->valid_lft) { spin_unlock(&ifp->lock); @@ -4557,35 +4586,6 @@ restart: in6_ifa_put(ifp); goto restart; } - } else if ((ifp->flags&IFA_F_TEMPORARY) && - !(ifp->flags&IFA_F_TENTATIVE)) { - unsigned long regen_advance = ifp->idev->cnf.regen_max_retry * - ifp->idev->cnf.dad_transmits * - max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; - - if (age >= ifp->prefered_lft - regen_advance) { - struct inet6_ifaddr *ifpub = ifp->ifpub; - if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) - next = ifp->tstamp + ifp->prefered_lft * HZ; - if (!ifp->regen_count && ifpub) { - ifp->regen_count++; - in6_ifa_hold(ifp); - in6_ifa_hold(ifpub); - spin_unlock(&ifp->lock); - - spin_lock(&ifpub->lock); - ifpub->regen_count = 0; - spin_unlock(&ifpub->lock); - rcu_read_unlock_bh(); - ipv6_create_tempaddr(ifpub, true); - in6_ifa_put(ifpub); - in6_ifa_put(ifp); - rcu_read_lock_bh(); - goto restart; - } - } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) - next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ; - spin_unlock(&ifp->lock); } else { /* ifp->prefered_lft <= ifp->valid_lft */ if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) @@ -5172,9 +5172,9 @@ next: fillargs->event = RTM_GETMULTICAST; /* multicast address */ - for (ifmca = rcu_dereference(idev->mc_list); + for (ifmca = rtnl_dereference(idev->mc_list); ifmca; - ifmca = rcu_dereference(ifmca->next), ip_idx++) { + ifmca = rtnl_dereference(ifmca->next), ip_idx++) { if (ip_idx < s_ip_idx) continue; err = inet6_fill_ifmcaddr(skb, ifmca, fillargs); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3e22cbe5966a..1bd10ae332e8 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -939,7 +939,6 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, __be16 proto; __u32 mtu; int nhoff; - int thoff; if (!pskb_inet_may_pull(skb)) goto tx_err; @@ -960,10 +959,16 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) truncate = true; - thoff = skb_transport_header(skb) - skb_mac_header(skb); - if (skb->protocol == htons(ETH_P_IPV6) && - (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)) - truncate = true; + if (skb->protocol == htons(ETH_P_IPV6)) { + int thoff; + + if (skb_transport_header_was_set(skb)) + thoff = skb_transport_header(skb) - skb_mac_header(skb); + else + thoff = nhoff + sizeof(struct ipv6hdr); + if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff) + truncate = true; + } if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen)) goto tx_err; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index d4aad41c9849..ec6e1509fc7c 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -62,7 +62,12 @@ struct ip6mr_result { Note that the changes are semaphored via rtnl_lock. */ -static DEFINE_RWLOCK(mrt_lock); +static DEFINE_SPINLOCK(mrt_lock); + +static struct net_device *vif_dev_read(const struct vif_device *vif) +{ + return rcu_dereference(vif->dev); +} /* Multicast router control variables */ @@ -85,11 +90,11 @@ static void ip6mr_free_table(struct mr_table *mrt); static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *cache); -static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, +static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd); -static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); +static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt); static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb); static void mroute_clean_tables(struct mr_table *mrt, int flags); @@ -398,7 +403,7 @@ static void ip6mr_free_table(struct mr_table *mrt) */ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(mrt_lock) + __acquires(RCU) { struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); @@ -410,14 +415,14 @@ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) iter->mrt = mrt; - read_lock(&mrt_lock); + rcu_read_lock(); return mr_vif_seq_start(seq, pos); } static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) - __releases(mrt_lock) + __releases(RCU) { - read_unlock(&mrt_lock); + rcu_read_unlock(); } static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) @@ -430,7 +435,11 @@ static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) "Interface BytesIn PktsIn BytesOut PktsOut Flags\n"); } else { const struct vif_device *vif = v; - const char *name = vif->dev ? vif->dev->name : "none"; + const struct net_device *vif_dev; + const char *name; + + vif_dev = vif_dev_read(vif); + name = vif_dev ? vif_dev->name : "none"; seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X\n", @@ -549,13 +558,11 @@ static int pim6_rcv(struct sk_buff *skb) if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto drop; - reg_vif_num = mrt->mroute_reg_vif_num; - read_lock(&mrt_lock); + /* Pairs with WRITE_ONCE() in mif6_add()/mif6_delete() */ + reg_vif_num = READ_ONCE(mrt->mroute_reg_vif_num); if (reg_vif_num >= 0) - reg_dev = mrt->vif_table[reg_vif_num].dev; - dev_hold(reg_dev); - read_unlock(&mrt_lock); + reg_dev = vif_dev_read(&mrt->vif_table[reg_vif_num]); if (!reg_dev) goto drop; @@ -570,7 +577,6 @@ static int pim6_rcv(struct sk_buff *skb) netif_rx(skb); - dev_put(reg_dev); return 0; drop: kfree_skb(skb); @@ -600,11 +606,12 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto tx_err; - read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; dev->stats.tx_packets++; - ip6mr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, MRT6MSG_WHOLEPKT); - read_unlock(&mrt_lock); + rcu_read_lock(); + ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num), + MRT6MSG_WHOLEPKT); + rcu_read_unlock(); kfree_skb(skb); return NETDEV_TX_OK; @@ -670,10 +677,11 @@ failure: static int call_ip6mr_vif_entry_notifiers(struct net *net, enum fib_event_type event_type, struct vif_device *vif, + struct net_device *vif_dev, mifi_t vif_index, u32 tb_id) { return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type, - vif, vif_index, tb_id, + vif, vif_dev, vif_index, tb_id, &net->ipv6.ipmr_seq); } @@ -698,23 +706,21 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify, v = &mrt->vif_table[vifi]; - if (VIF_EXISTS(mrt, vifi)) - call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net), - FIB_EVENT_VIF_DEL, v, vifi, - mrt->id); - - write_lock_bh(&mrt_lock); - dev = v->dev; - v->dev = NULL; - - if (!dev) { - write_unlock_bh(&mrt_lock); + dev = rtnl_dereference(v->dev); + if (!dev) return -EADDRNOTAVAIL; - } + + call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net), + FIB_EVENT_VIF_DEL, v, dev, + vifi, mrt->id); + spin_lock(&mrt_lock); + RCU_INIT_POINTER(v->dev, NULL); #ifdef CONFIG_IPV6_PIMSM_V2 - if (vifi == mrt->mroute_reg_vif_num) - mrt->mroute_reg_vif_num = -1; + if (vifi == mrt->mroute_reg_vif_num) { + /* Pairs with READ_ONCE() in ip6mr_cache_report() and reg_vif_xmit() */ + WRITE_ONCE(mrt->mroute_reg_vif_num, -1); + } #endif if (vifi + 1 == mrt->maxvif) { @@ -723,10 +729,10 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify, if (VIF_EXISTS(mrt, tmp)) break; } - mrt->maxvif = tmp + 1; + WRITE_ONCE(mrt->maxvif, tmp + 1); } - write_unlock_bh(&mrt_lock); + spin_unlock(&mrt_lock); dev_set_allmulti(dev, -1); @@ -826,7 +832,7 @@ static void ipmr_expire_process(struct timer_list *t) spin_unlock(&mfc_unres_lock); } -/* Fill oifs list. It is called under write locked mrt_lock. */ +/* Fill oifs list. It is called under locked mrt_lock. */ static void ip6mr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, @@ -912,18 +918,18 @@ static int mif6_add(struct net *net, struct mr_table *mrt, MIFF_REGISTER); /* And finish update writing critical data */ - write_lock_bh(&mrt_lock); - v->dev = dev; + spin_lock(&mrt_lock); + rcu_assign_pointer(v->dev, dev); netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC); #ifdef CONFIG_IPV6_PIMSM_V2 if (v->flags & MIFF_REGISTER) - mrt->mroute_reg_vif_num = vifi; + WRITE_ONCE(mrt->mroute_reg_vif_num, vifi); #endif if (vifi + 1 > mrt->maxvif) - mrt->maxvif = vifi + 1; - write_unlock_bh(&mrt_lock); + WRITE_ONCE(mrt->maxvif, vifi + 1); + spin_unlock(&mrt_lock); call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, - v, vifi, mrt->id); + v, dev, vifi, mrt->id); return 0; } @@ -1028,10 +1034,10 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, /* * Bounce a cache query up to pim6sd and netlink. * - * Called under mrt_lock. + * Called under rcu_read_lock() */ -static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, +static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert) { struct sock *mroute6_sk; @@ -1072,7 +1078,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, if (assert == MRT6MSG_WRMIFWHOLE) msg->im6_mif = mifi; else - msg->im6_mif = mrt->mroute_reg_vif_num; + msg->im6_mif = READ_ONCE(mrt->mroute_reg_vif_num); msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; @@ -1107,10 +1113,8 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, skb->ip_summed = CHECKSUM_UNNECESSARY; } - rcu_read_lock(); mroute6_sk = rcu_dereference(mrt->mroute_sk); if (!mroute6_sk) { - rcu_read_unlock(); kfree_skb(skb); return -EINVAL; } @@ -1119,7 +1123,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, /* Deliver to user space multicast routing algorithms */ ret = sock_queue_rcv_skb(mroute6_sk, skb); - rcu_read_unlock(); + if (ret < 0) { net_warn_ratelimited("mroute6: pending queue full, dropping entries\n"); kfree_skb(skb); @@ -1243,7 +1247,7 @@ static int ip6mr_device_event(struct notifier_block *this, ip6mr_for_each_table(mrt, net) { v = &mrt->vif_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { - if (v->dev == dev) + if (rcu_access_pointer(v->dev) == dev) mif6_delete(mrt, ct, 1, NULL); } } @@ -1262,7 +1266,7 @@ static int ip6mr_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump, - ip6mr_mr_table_iter, &mrt_lock, extack); + ip6mr_mr_table_iter, extack); } static struct notifier_block ip6_mr_notifier = { @@ -1437,12 +1441,12 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (c) { - write_lock_bh(&mrt_lock); + spin_lock(&mrt_lock); c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; - write_unlock_bh(&mrt_lock); + spin_unlock(&mrt_lock); call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); @@ -1560,7 +1564,7 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk) struct net *net = sock_net(sk); rtnl_lock(); - write_lock_bh(&mrt_lock); + spin_lock(&mrt_lock); if (rtnl_dereference(mrt->mroute_sk)) { err = -EADDRINUSE; } else { @@ -1568,7 +1572,7 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk) sock_set_flag(sk, SOCK_RCU_FREE); atomic_inc(&net->ipv6.devconf_all->mc_forwarding); } - write_unlock_bh(&mrt_lock); + spin_unlock(&mrt_lock); if (!err) inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, @@ -1598,14 +1602,14 @@ int ip6mr_sk_done(struct sock *sk) rtnl_lock(); ip6mr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { - write_lock_bh(&mrt_lock); + spin_lock(&mrt_lock); RCU_INIT_POINTER(mrt->mroute_sk, NULL); /* Note that mroute_sk had SOCK_RCU_FREE set, * so the RCU grace period before sk freeing * is guaranteed by sk_destruct() */ atomic_dec(&devconf->mc_forwarding); - write_unlock_bh(&mrt_lock); + spin_unlock(&mrt_lock); inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, @@ -1891,20 +1895,20 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) if (vr.mifi >= mrt->maxvif) return -EINVAL; vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); - read_lock(&mrt_lock); + rcu_read_lock(); vif = &mrt->vif_table[vr.mifi]; if (VIF_EXISTS(mrt, vr.mifi)) { - vr.icount = vif->pkt_in; - vr.ocount = vif->pkt_out; - vr.ibytes = vif->bytes_in; - vr.obytes = vif->bytes_out; - read_unlock(&mrt_lock); + vr.icount = READ_ONCE(vif->pkt_in); + vr.ocount = READ_ONCE(vif->pkt_out); + vr.ibytes = READ_ONCE(vif->bytes_in); + vr.obytes = READ_ONCE(vif->bytes_out); + rcu_read_unlock(); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } - read_unlock(&mrt_lock); + rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: if (copy_from_user(&sr, arg, sizeof(sr))) @@ -1966,20 +1970,20 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) if (vr.mifi >= mrt->maxvif) return -EINVAL; vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); - read_lock(&mrt_lock); + rcu_read_lock(); vif = &mrt->vif_table[vr.mifi]; if (VIF_EXISTS(mrt, vr.mifi)) { - vr.icount = vif->pkt_in; - vr.ocount = vif->pkt_out; - vr.ibytes = vif->bytes_in; - vr.obytes = vif->bytes_out; - read_unlock(&mrt_lock); + vr.icount = READ_ONCE(vif->pkt_in); + vr.ocount = READ_ONCE(vif->pkt_out); + vr.ibytes = READ_ONCE(vif->bytes_in); + vr.obytes = READ_ONCE(vif->bytes_out); + rcu_read_unlock(); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } - read_unlock(&mrt_lock); + rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: if (copy_from_user(&sr, arg, sizeof(sr))) @@ -2021,21 +2025,22 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct static int ip6mr_forward2(struct net *net, struct mr_table *mrt, struct sk_buff *skb, int vifi) { - struct ipv6hdr *ipv6h; struct vif_device *vif = &mrt->vif_table[vifi]; - struct net_device *dev; + struct net_device *vif_dev; + struct ipv6hdr *ipv6h; struct dst_entry *dst; struct flowi6 fl6; - if (!vif->dev) + vif_dev = vif_dev_read(vif); + if (!vif_dev) goto out_free; #ifdef CONFIG_IPV6_PIMSM_V2 if (vif->flags & MIFF_REGISTER) { - vif->pkt_out++; - vif->bytes_out += skb->len; - vif->dev->stats.tx_bytes += skb->len; - vif->dev->stats.tx_packets++; + WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); + WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); + vif_dev->stats.tx_bytes += skb->len; + vif_dev->stats.tx_packets++; ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); goto out_free; } @@ -2068,14 +2073,13 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ - dev = vif->dev; - skb->dev = dev; - vif->pkt_out++; - vif->bytes_out += skb->len; + skb->dev = vif_dev; + WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); + WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); /* We are about to write */ /* XXX: extension headers? */ - if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(dev))) + if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev))) goto out_free; ipv6h = ipv6_hdr(skb); @@ -2084,7 +2088,7 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, IP6CB(skb)->flags |= IP6SKB_FORWARDED; return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, - net, NULL, skb, skb->dev, dev, + net, NULL, skb, skb->dev, vif_dev, ip6mr_forward2_finish); out_free: @@ -2092,17 +2096,20 @@ out_free: return 0; } +/* Called with rcu_read_lock() */ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) { int ct; - for (ct = mrt->maxvif - 1; ct >= 0; ct--) { - if (mrt->vif_table[ct].dev == dev) + /* Pairs with WRITE_ONCE() in mif6_delete()/mif6_add() */ + for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) { + if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev) break; } return ct; } +/* Called under rcu_read_lock() */ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *c) @@ -2122,20 +2129,18 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. */ - rcu_read_lock(); cache_proxy = mr_mfc_find_any_parent(mrt, vif); if (cache_proxy && cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) { rcu_read_unlock(); goto forward; } - rcu_read_unlock(); } /* * Wrong interface: drop packet and (maybe) send PIM assert. */ - if (mrt->vif_table[vif].dev != dev) { + if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { c->_c.mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && @@ -2159,8 +2164,10 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, } forward: - mrt->vif_table[vif].pkt_in++; - mrt->vif_table[vif].bytes_in += skb->len; + WRITE_ONCE(mrt->vif_table[vif].pkt_in, + mrt->vif_table[vif].pkt_in + 1); + WRITE_ONCE(mrt->vif_table[vif].bytes_in, + mrt->vif_table[vif].bytes_in + skb->len); /* * Forward the frame @@ -2238,7 +2245,6 @@ int ip6_mr_input(struct sk_buff *skb) return err; } - read_lock(&mrt_lock); cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); if (!cache) { @@ -2259,19 +2265,15 @@ int ip6_mr_input(struct sk_buff *skb) vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) { int err = ip6mr_cache_unresolved(mrt, vif, skb, dev); - read_unlock(&mrt_lock); return err; } - read_unlock(&mrt_lock); kfree_skb(skb); return -ENODEV; } ip6_mr_forward(net, mrt, dev, skb, cache); - read_unlock(&mrt_lock); - return 0; } @@ -2287,7 +2289,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, if (!mrt) return -ENOENT; - read_lock(&mrt_lock); + rcu_read_lock(); cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); if (!cache && skb->dev) { int vif = ip6mr_find_vif(mrt, skb->dev); @@ -2305,14 +2307,14 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, dev = skb->dev; if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) { - read_unlock(&mrt_lock); + rcu_read_unlock(); return -ENODEV; } /* really correct? */ skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb2) { - read_unlock(&mrt_lock); + rcu_read_unlock(); return -ENOMEM; } @@ -2335,13 +2337,13 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, iph->daddr = rt->rt6i_dst.addr; err = ip6mr_cache_unresolved(mrt, vif, skb2, dev); - read_unlock(&mrt_lock); + rcu_read_unlock(); return err; } err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); - read_unlock(&mrt_lock); + rcu_read_unlock(); return err; } @@ -2460,7 +2462,7 @@ static size_t mrt6msg_netlink_msgsize(size_t payloadlen) return len; } -static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt) +static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt) { struct net *net = read_pnet(&mrt->net); struct nlmsghdr *nlh; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b0dfe97ea4ee..cd84cbdac0a2 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -128,6 +128,7 @@ struct neigh_table nd_tbl = { [NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER, [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, + [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 46b560aacc11..722de9dd0ff7 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -332,7 +332,6 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, void raw6_icmp_error(struct sk_buff *skb, int nexthdr, u8 type, u8 code, int inner_offset, __be32 info) { - const struct in6_addr *saddr, *daddr; struct net *net = dev_net(skb->dev); struct hlist_nulls_head *hlist; struct hlist_nulls_node *hnode; @@ -345,8 +344,6 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, sk_nulls_for_each(sk, hnode, hlist) { /* Note: ipv6_hdr(skb) != skb->data */ const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data; - saddr = &ip6h->saddr; - daddr = &ip6h->daddr; if (!raw_v6_match(net, sk, nexthdr, &ip6h->saddr, &ip6h->daddr, inet6_iif(skb), inet6_iif(skb))) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0be01a4d48c1..70cd50c1fa6f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4569,8 +4569,15 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, } f6i = ip6_route_info_create(&cfg, gfp_flags, NULL); - if (!IS_ERR(f6i)) + if (!IS_ERR(f6i)) { f6i->dst_nocount = true; + + if (!anycast && + (net->ipv6.devconf_all->disable_policy || + idev->cnf.disable_policy)) + f6i->dst_nopolicy = true; + } + return f6i; } @@ -5934,7 +5941,7 @@ int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip) rcu_read_unlock(); if (err) - return count += w.count; + return count + w.count; } return -1; diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 6de01185cc68..d43c50a7310d 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -406,7 +406,6 @@ int __net_init seg6_hmac_net_init(struct net *net) return rhashtable_init(&sdata->hmac_infos, &rht_params); } -EXPORT_SYMBOL(seg6_hmac_net_init); void seg6_hmac_exit(void) { diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index fab89fd978f0..6b73b7a5f175 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -323,8 +323,6 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __u kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) : NULL; - rcu_read_lock(); - ca = min(t->prl_count, cmax); if (!kp) { @@ -341,7 +339,7 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __u } } - c = 0; + rcu_read_lock(); for_each_prl_rcu(t->prl) { if (c >= cmax) break; @@ -353,7 +351,7 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __u if (kprl.addr != htonl(INADDR_ANY)) break; } -out: + rcu_read_unlock(); len = sizeof(*kp) * c; @@ -362,7 +360,7 @@ out: ret = -EFAULT; kfree(kp); - +out: return ret; } diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 9d1aafe75f92..4595b56d175d 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -184,7 +184,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) session->pwtype == L2TP_PWTYPE_PPP ? "PPP" : ""); if (session->send_seq || session->recv_seq) - seq_printf(m, " nr %hu, ns %hu\n", session->nr, session->ns); + seq_printf(m, " nr %u, ns %u\n", session->nr, session->ns); seq_printf(m, " refcnt %d\n", refcount_read(&session->ref_count)); seq_printf(m, " config 0/0/%c/%c/-/%s %08x %u\n", session->recv_seq ? 'R' : '-', @@ -192,7 +192,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) session->lns_mode ? "LNS" : "LAC", 0, jiffies_to_msecs(session->reorder_timeout)); - seq_printf(m, " offset 0 l2specific %hu/%hu\n", + seq_printf(m, " offset 0 l2specific %hu/%d\n", session->l2specific_type, l2tp_get_l2specific_len(session)); if (session->cookie_len) { seq_printf(m, " cookie %02x%02x%02x%02x", @@ -215,7 +215,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) seq_puts(m, "\n"); } - seq_printf(m, " %hu/%hu tx %ld/%ld/%ld rx %ld/%ld/%ld\n", + seq_printf(m, " %u/%u tx %ld/%ld/%ld rx %ld/%ld/%ld\n", session->nr, session->ns, atomic_long_read(&session->stats.tx_packets), atomic_long_read(&session->stats.tx_bytes), diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 8be1fdc68a0b..db2e584c625e 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1553,7 +1553,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) session->lns_mode ? "LNS" : "LAC", 0, jiffies_to_msecs(session->reorder_timeout)); - seq_printf(m, " %hu/%hu %ld/%ld/%ld %ld/%ld/%ld\n", + seq_printf(m, " %u/%u %ld/%ld/%ld %ld/%ld/%ld\n", session->nr, session->ns, atomic_long_read(&session->stats.tx_packets), atomic_long_read(&session->stats.tx_bytes), diff --git a/net/mptcp/options.c b/net/mptcp/options.c index be3b918a6d15..bd8f0f425be4 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -765,6 +765,7 @@ static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_bu opts->suboptions |= OPTION_MPTCP_RST; opts->reset_transient = subflow->reset_transient; opts->reset_reason = subflow->reset_reason; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX); return true; } @@ -788,6 +789,7 @@ static bool mptcp_established_options_fastclose(struct sock *sk, opts->rcvr_key = msk->remote_key; pr_debug("FASTCLOSE key=%llu", opts->rcvr_key); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); return true; } @@ -809,6 +811,7 @@ static bool mptcp_established_options_mp_fail(struct sock *sk, opts->fail_seq = subflow->map_seq; pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX); return true; } @@ -833,13 +836,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); } /* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */ if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX); } return true; } @@ -966,7 +967,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, goto reset; subflow->mp_capable = 0; pr_fallback(msk); - __mptcp_do_fallback(msk); + mptcp_do_fallback(ssk); return false; } @@ -1583,6 +1584,9 @@ mp_rst: *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO, TCPOLEN_MPTCP_PRIO, opts->backup, TCPOPT_NOP); + + MPTCP_INC_STATS(sock_net((const struct sock *)tp), + MPTCP_MIB_MPPRIOTX); } mp_capable_done: diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 59a85220edc9..45e2a48397b9 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -299,23 +299,21 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); - struct sock *s = (struct sock *)msk; pr_debug("fail_seq=%llu", fail_seq); if (!READ_ONCE(msk->allow_infinite_fallback)) return; - if (!READ_ONCE(subflow->mp_fail_response_expect)) { + if (!subflow->fail_tout) { pr_debug("send MP_FAIL response and infinite map"); subflow->send_mp_fail = 1; - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX); subflow->send_infinite_map = 1; - } else if (!sock_flag(sk, SOCK_DEAD)) { + tcp_send_ack(sk); + } else { pr_debug("MP_FAIL response received"); - - sk_stop_timer(s, &s->sk_timer); + WRITE_ONCE(subflow->fail_tout, 0); } } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index e099f2a12504..5bdb559d5242 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -717,9 +717,10 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) } } -static int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - u8 bkup) +int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + struct mptcp_addr_info *rem, + u8 bkup) { struct mptcp_subflow_context *subflow; @@ -727,24 +728,29 @@ static int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - struct sock *sk = (struct sock *)msk; - struct mptcp_addr_info local; + struct mptcp_addr_info local, remote; + bool slow; local_address((struct sock_common *)ssk, &local); if (!mptcp_addresses_equal(&local, addr, addr->port)) continue; + if (rem && rem->family != AF_UNSPEC) { + remote_address((struct sock_common *)ssk, &remote); + if (!mptcp_addresses_equal(&remote, rem, rem->port)) + continue; + } + + slow = lock_sock_fast(ssk); if (subflow->backup != bkup) msk->last_snd = NULL; subflow->backup = bkup; subflow->send_mp_prio = 1; subflow->request_bkup = bkup; - __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIOTX); - spin_unlock_bh(&msk->pm.lock); pr_debug("send ack for mp_prio"); - mptcp_subflow_send_ack(ssk); - spin_lock_bh(&msk->pm.lock); + __mptcp_subflow_send_ack(ssk); + unlock_sock_fast(ssk, slow); return 0; } @@ -801,7 +807,8 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, removed = true; __MPTCP_INC_STATS(sock_net(sk), rm_type); } - __set_bit(rm_list->ids[i], msk->pm.id_avail_bitmap); + if (rm_type == MPTCP_MIB_RMSUBFLOW) + __set_bit(rm_list->ids[i], msk->pm.id_avail_bitmap); if (!removed) continue; @@ -1127,7 +1134,7 @@ void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ss } unlock_sock_fast(ssk, slow); - /* always try to push the pending data regarless of re-injections: + /* always try to push the pending data regardless of re-injections: * we can possibly use backup subflows now, and subflow selection * is cheap under the msk socket lock */ @@ -1816,8 +1823,10 @@ static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, list.ids[list.nr++] = addr->id; + spin_lock_bh(&msk->pm.lock); mptcp_pm_nl_rm_subflow_received(msk, &list); mptcp_pm_create_subflow_or_signal_addr(msk); + spin_unlock_bh(&msk->pm.lock); } static int mptcp_nl_set_flags(struct net *net, @@ -1835,12 +1844,10 @@ static int mptcp_nl_set_flags(struct net *net, goto next; lock_sock(sk); - spin_lock_bh(&msk->pm.lock); if (changed & MPTCP_PM_ADDR_FLAG_BACKUP) - ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, bkup); + ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup); if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH) mptcp_pm_nl_fullmesh(msk, addr); - spin_unlock_bh(&msk->pm.lock); release_sock(sk); next: @@ -1854,6 +1861,9 @@ next: static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) { struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }, *entry; + struct mptcp_pm_addr_entry remote = { .addr = { .family = AF_UNSPEC }, }; + struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP | @@ -1866,6 +1876,12 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; + if (attr_rem) { + ret = mptcp_pm_parse_entry(attr_rem, info, false, &remote); + if (ret < 0) + return ret; + } + if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) bkup = 1; if (addr.addr.family == AF_UNSPEC) { @@ -1874,6 +1890,10 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; } + if (token) + return mptcp_userspace_pm_set_flags(sock_net(skb->sk), + token, &addr, &remote, bkup); + spin_lock_bh(&pernet->lock); entry = __lookup_addr(pernet, &addr.addr, lookup_by_id); if (!entry) { diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index f56378e4f597..9e82250cbb70 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -5,6 +5,7 @@ */ #include "protocol.h" +#include "mib.h" void mptcp_free_local_addr_list(struct mptcp_sock *msk) { @@ -306,15 +307,11 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, const struct mptcp_addr_info *local, const struct mptcp_addr_info *remote) { - struct sock *sk = &msk->sk.icsk_inet.sk; struct mptcp_subflow_context *subflow; - struct sock *found = NULL; if (local->family != remote->family) return NULL; - lock_sock(sk); - mptcp_for_each_subflow(msk, subflow) { const struct inet_sock *issk; struct sock *ssk; @@ -347,16 +344,11 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, } if (issk->inet_sport == local->port && - issk->inet_dport == remote->port) { - found = ssk; - goto found; - } + issk->inet_dport == remote->port) + return ssk; } -found: - release_sock(sk); - - return found; + return NULL; } int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info) @@ -412,18 +404,51 @@ int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info) } sk = &msk->sk.icsk_inet.sk; + lock_sock(sk); ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r); if (ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN); mptcp_close_ssk(sk, ssk, subflow); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW); err = 0; } else { err = -ESRCH; } + release_sock(sk); - destroy_err: +destroy_err: sock_put((struct sock *)msk); return err; } + +int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token, + struct mptcp_pm_addr_entry *loc, + struct mptcp_pm_addr_entry *rem, u8 bkup) +{ + struct mptcp_sock *msk; + int ret = -EINVAL; + u32 token_val; + + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(net, token_val); + if (!msk) + return ret; + + if (!mptcp_pm_is_userspace(msk)) + goto set_flags_err; + + if (loc->addr.family == AF_UNSPEC || + rem->addr.family == AF_UNSPEC) + goto set_flags_err; + + lock_sock((struct sock *)msk); + ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc->addr, &rem->addr, bkup); + release_sock((struct sock *)msk); + +set_flags_err: + sock_put((struct sock *)msk); + return ret; +} diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e0fb9f96c45c..2caad4a3adea 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -181,8 +181,8 @@ static void mptcp_rmem_uncharge(struct sock *sk, int size) reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk); /* see sk_mem_uncharge() for the rationale behind the following schema */ - if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD)) - __mptcp_rmem_reclaim(sk, SK_RECLAIM_CHUNK); + if (unlikely(reclaimable >= PAGE_SIZE)) + __mptcp_rmem_reclaim(sk, reclaimable); } static void mptcp_rfree(struct sk_buff *skb) @@ -323,20 +323,16 @@ static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size) struct mptcp_sock *msk = mptcp_sk(sk); int amt, amount; - if (size < msk->rmem_fwd_alloc) + if (size <= msk->rmem_fwd_alloc) return true; + size -= msk->rmem_fwd_alloc; amt = sk_mem_pages(size); amount = amt << PAGE_SHIFT; - msk->rmem_fwd_alloc += amount; - if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) { - if (ssk->sk_forward_alloc < amount) { - msk->rmem_fwd_alloc -= amount; - return false; - } + if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) + return false; - ssk->sk_forward_alloc -= amount; - } + msk->rmem_fwd_alloc += amount; return true; } @@ -500,19 +496,24 @@ static void mptcp_set_timeout(struct sock *sk) __mptcp_set_timeout(sk, tout); } -static bool tcp_can_send_ack(const struct sock *ssk) +static inline bool tcp_can_send_ack(const struct sock *ssk) { return !((1 << inet_sk_state_load(ssk)) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); } +void __mptcp_subflow_send_ack(struct sock *ssk) +{ + if (tcp_can_send_ack(ssk)) + tcp_send_ack(ssk); +} + void mptcp_subflow_send_ack(struct sock *ssk) { bool slow; slow = lock_sock_fast(ssk); - if (tcp_can_send_ack(ssk)) - tcp_send_ack(ssk); + __mptcp_subflow_send_ack(ssk); unlock_sock_fast(ssk, slow); } @@ -966,25 +967,6 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, df->data_seq + df->data_len == msk->write_seq; } -static void __mptcp_mem_reclaim_partial(struct sock *sk) -{ - int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk); - - lockdep_assert_held_once(&sk->sk_lock.slock); - - if (reclaimable > (int)PAGE_SIZE) - __mptcp_rmem_reclaim(sk, reclaimable - 1); - - sk_mem_reclaim(sk); -} - -static void mptcp_mem_reclaim_partial(struct sock *sk) -{ - mptcp_data_lock(sk); - __mptcp_mem_reclaim_partial(sk); - mptcp_data_unlock(sk); -} - static void dfrag_uncharge(struct sock *sk, int len) { sk_mem_uncharge(sk, len); @@ -1004,7 +986,6 @@ static void __mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; - bool cleaned = false; u64 snd_una; /* on fallback we just need to ignore snd_una, as this is really @@ -1027,7 +1008,6 @@ static void __mptcp_clean_una(struct sock *sk) } dfrag_clear(sk, dfrag); - cleaned = true; } dfrag = mptcp_rtx_head(sk); @@ -1049,7 +1029,6 @@ static void __mptcp_clean_una(struct sock *sk) dfrag->already_sent -= delta; dfrag_uncharge(sk, delta); - cleaned = true; } /* all retransmitted data acked, recovery completed */ @@ -1057,9 +1036,6 @@ static void __mptcp_clean_una(struct sock *sk) msk->recovery = false; out: - if (cleaned && tcp_under_memory_pressure(sk)) - __mptcp_mem_reclaim_partial(sk); - if (snd_una == READ_ONCE(msk->snd_nxt) && snd_una == READ_ONCE(msk->write_seq)) { if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) @@ -1211,12 +1187,6 @@ static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, boo { gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation; - if (unlikely(tcp_under_memory_pressure(sk))) { - if (data_lock_held) - __mptcp_mem_reclaim_partial(sk); - else - mptcp_mem_reclaim_partial(sk); - } return __mptcp_alloc_tx_skb(sk, ssk, gfp); } @@ -1245,7 +1215,7 @@ static void mptcp_update_infinite_map(struct mptcp_sock *msk, MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX); mptcp_subflow_ctx(ssk)->send_infinite_map = 0; pr_fallback(msk); - __mptcp_do_fallback(msk); + mptcp_do_fallback(ssk); } static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, @@ -2175,21 +2145,6 @@ static void mptcp_retransmit_timer(struct timer_list *t) sock_put(sk); } -static struct mptcp_subflow_context * -mp_fail_response_expect_subflow(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow, *ret = NULL; - - mptcp_for_each_subflow(msk, subflow) { - if (READ_ONCE(subflow->mp_fail_response_expect)) { - ret = subflow; - break; - } - } - - return ret; -} - static void mptcp_timeout_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); @@ -2346,6 +2301,11 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, kfree_rcu(subflow, rcu); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ + if (ssk->sk_state == TCP_LISTEN) { + tcp_set_state(ssk, TCP_CLOSE); + mptcp_subflow_queue_clean(ssk); + inet_csk_listen_stop(ssk); + } __tcp_close(ssk, 0); /* close acquired an extra ref */ @@ -2518,27 +2478,50 @@ reset_timer: mptcp_reset_timer(sk); } +/* schedule the timeout timer for the relevant event: either close timeout + * or mp_fail timeout. The close timeout takes precedence on the mp_fail one + */ +void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout) +{ + struct sock *sk = (struct sock *)msk; + unsigned long timeout, close_timeout; + + if (!fail_tout && !sock_flag(sk, SOCK_DEAD)) + return; + + close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN; + + /* the close timeout takes precedence on the fail one, and here at least one of + * them is active + */ + timeout = sock_flag(sk, SOCK_DEAD) ? close_timeout : fail_tout; + + sk_reset_timer(sk, &sk->sk_timer, timeout); +} + static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) { - struct mptcp_subflow_context *subflow; - struct sock *ssk; + struct sock *ssk = msk->first; bool slow; - subflow = mp_fail_response_expect_subflow(msk); - if (subflow) { - pr_debug("MP_FAIL doesn't respond, reset the subflow"); + if (!ssk) + return; + + pr_debug("MP_FAIL doesn't respond, reset the subflow"); - ssk = mptcp_subflow_tcp_sock(subflow); - slow = lock_sock_fast(ssk); - mptcp_subflow_reset(ssk); - unlock_sock_fast(ssk, slow); - } + slow = lock_sock_fast(ssk); + mptcp_subflow_reset(ssk); + WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0); + unlock_sock_fast(ssk, slow); + + mptcp_reset_timeout(msk, 0); } static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *sk = &msk->sk.icsk_inet.sk; + unsigned long fail_tout; int state; lock_sock(sk); @@ -2575,7 +2558,9 @@ static void mptcp_worker(struct work_struct *work) if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) __mptcp_retrans(sk); - mptcp_mp_fail_no_response(msk); + fail_tout = msk->first ? READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0; + if (fail_tout && time_after(jiffies, fail_tout)) + mptcp_mp_fail_no_response(msk); unlock: release_sock(sk); @@ -2822,6 +2807,7 @@ static void __mptcp_destroy_sock(struct sock *sk) static void mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); bool do_cancel_work = false; lock_sock(sk); @@ -2840,10 +2826,16 @@ static void mptcp_close(struct sock *sk, long timeout) cleanup: /* orphan all the subflows */ inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; - mptcp_for_each_subflow(mptcp_sk(sk), subflow) { + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast_nested(ssk); + /* since the close timeout takes precedence on the fail one, + * cancel the latter + */ + if (ssk == msk->first) + subflow->fail_tout = 0; + sock_orphan(ssk); unlock_sock_fast(ssk, slow); } @@ -2852,13 +2844,13 @@ cleanup: sock_hold(sk); pr_debug("msk=%p state=%d", sk, sk->sk_state); if (mptcp_sk(sk)->token) - mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); + mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); if (sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); do_cancel_work = true; } else { - sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN); + mptcp_reset_timeout(msk, 0); } release_sock(sk); if (do_cancel_work) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 200f89f6d62f..07871e10e510 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -83,7 +83,6 @@ /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) -#define MPTCPOPT_HMAC_LEN 20 #define MPTCPOPT_THMAC_LEN 8 /* MPTCP MP_CAPABLE flags */ @@ -306,6 +305,7 @@ struct mptcp_sock { u32 setsockopt_seq; char ca_name[TCP_CA_NAME_MAX]; + struct mptcp_sock *dl_next; }; #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) @@ -468,7 +468,6 @@ struct mptcp_subflow_context { local_id_valid : 1, /* local_id is correctly initialized */ valid_csum_seen : 1; /* at least one csum validated */ enum mptcp_data_avail data_avail; - bool mp_fail_response_expect; u32 remote_nonce; u64 thmac; u32 local_nonce; @@ -482,6 +481,7 @@ struct mptcp_subflow_context { u8 stale_count; long delegated_status; + unsigned long fail_tout; ); @@ -606,8 +606,10 @@ void __init mptcp_subflow_init(void); void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); +void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); +void mptcp_subflow_queue_clean(struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); @@ -662,6 +664,7 @@ void mptcp_get_options(const struct sk_buff *skb, void mptcp_finish_connect(struct sock *sk); void __mptcp_set_connected(struct sock *sk); +void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout); static inline bool mptcp_is_fully_established(struct sock *sk) { return inet_sk_state_load(sk) == TCP_ESTABLISHED && @@ -768,6 +771,10 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); +int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + struct mptcp_addr_info *rem, + u8 bkup); bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, const struct mptcp_pm_addr_entry *entry); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); @@ -784,7 +791,9 @@ int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex); - +int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token, + struct mptcp_pm_addr_entry *loc, + struct mptcp_pm_addr_entry *rem, u8 bkup); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo); @@ -926,12 +935,25 @@ static inline void __mptcp_do_fallback(struct mptcp_sock *msk) set_bit(MPTCP_FALLBACK_DONE, &msk->flags); } -static inline void mptcp_do_fallback(struct sock *sk) +static inline void mptcp_do_fallback(struct sock *ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct mptcp_sock *msk = mptcp_sk(subflow->conn); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = subflow->conn; + struct mptcp_sock *msk; + msk = mptcp_sk(sk); __mptcp_do_fallback(msk); + if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) { + gfp_t saved_allocation = ssk->sk_allocation; + + /* we are in a atomic (BH) scope, override ssk default for data + * fin allocation + */ + ssk->sk_allocation = GFP_ATOMIC; + ssk->sk_shutdown |= SEND_SHUTDOWN; + tcp_shutdown(ssk, SEND_SHUTDOWN); + ssk->sk_allocation = saved_allocation; + } } #define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 8841e8cd9ad8..d4b16d033978 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -843,7 +843,8 @@ enum mapping_status { MAPPING_INVALID, MAPPING_EMPTY, MAPPING_DATA_FIN, - MAPPING_DUMMY + MAPPING_DUMMY, + MAPPING_BAD_CSUM }; static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) @@ -958,11 +959,7 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff * subflow->map_data_csum); if (unlikely(csum)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); - if (subflow->mp_join || subflow->valid_csum_seen) { - subflow->send_mp_fail = 1; - MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPFAILTX); - } - return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY; + return MAPPING_BAD_CSUM; } subflow->valid_csum_seen = 1; @@ -974,7 +971,6 @@ static enum mapping_status get_mapping_status(struct sock *ssk, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool csum_reqd = READ_ONCE(msk->csum_enabled); - struct sock *sk = (struct sock *)msk; struct mptcp_ext *mpext; struct sk_buff *skb; u16 data_len; @@ -1016,9 +1012,6 @@ static enum mapping_status get_mapping_status(struct sock *ssk, pr_debug("infinite mapping received"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); subflow->map_data_len = 0; - if (!sock_flag(ssk, SOCK_DEAD)) - sk_stop_timer(sk, &sk->sk_timer); - return MAPPING_INVALID; } @@ -1165,6 +1158,33 @@ static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) return !subflow->fully_established; } +static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + unsigned long fail_tout; + + /* greceful failure can happen only on the MPC subflow */ + if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) + return; + + /* since the close timeout take precedence on the fail one, + * no need to start the latter when the first is already set + */ + if (sock_flag((struct sock *)msk, SOCK_DEAD)) + return; + + /* we don't need extreme accuracy here, use a zero fail_tout as special + * value meaning no fail timeout at all; + */ + fail_tout = jiffies + TCP_RTO_MAX; + if (!fail_tout) + fail_tout = 1; + WRITE_ONCE(subflow->fail_tout, fail_tout); + tcp_send_ack(ssk); + + mptcp_reset_timeout(msk, subflow->fail_tout); +} + static bool subflow_check_data_avail(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); @@ -1184,10 +1204,8 @@ static bool subflow_check_data_avail(struct sock *ssk) status = get_mapping_status(ssk, msk); trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); - if (unlikely(status == MAPPING_INVALID)) - goto fallback; - - if (unlikely(status == MAPPING_DUMMY)) + if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY || + status == MAPPING_BAD_CSUM)) goto fallback; if (status != MAPPING_OK) @@ -1229,22 +1247,17 @@ no_data: fallback: if (!__mptcp_check_fallback(msk)) { /* RFC 8684 section 3.7. */ - if (subflow->send_mp_fail) { + if (status == MAPPING_BAD_CSUM && + (subflow->mp_join || subflow->valid_csum_seen)) { + subflow->send_mp_fail = 1; + if (!READ_ONCE(msk->allow_infinite_fallback)) { - ssk->sk_err = EBADMSG; - tcp_set_state(ssk, TCP_CLOSE); subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; - tcp_send_active_reset(ssk, GFP_ATOMIC); - while ((skb = skb_peek(&ssk->sk_receive_queue))) - sk_eat_skb(ssk, skb); - } else if (!sock_flag(ssk, SOCK_DEAD)) { - WRITE_ONCE(subflow->mp_fail_response_expect, true); - sk_reset_timer((struct sock *)msk, - &((struct sock *)msk)->sk_timer, - jiffies + TCP_RTO_MAX); + goto reset; } - WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); + mptcp_subflow_fail(msk, ssk); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); return true; } @@ -1252,16 +1265,20 @@ fallback: /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ - ssk->sk_err = EBADMSG; - tcp_set_state(ssk, TCP_CLOSE); subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMPTCP; + +reset: + ssk->sk_err = EBADMSG; + tcp_set_state(ssk, TCP_CLOSE); + while ((skb = skb_peek(&ssk->sk_receive_queue))) + sk_eat_skb(ssk, skb); tcp_send_active_reset(ssk, GFP_ATOMIC); WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); return false; } - __mptcp_do_fallback(msk); + mptcp_do_fallback(ssk); } skb = skb_peek(&ssk->sk_receive_queue); @@ -1617,7 +1634,7 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) /* the newly created socket really belongs to the owning MPTCP master * socket, even if for additional subflows the allocation is performed * by a kernel workqueue. Adjust inode references, so that the - * procfs/diag interaces really show this one belonging to the correct + * procfs/diag interfaces really show this one belonging to the correct * user. */ SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino; @@ -1706,6 +1723,58 @@ static void subflow_state_change(struct sock *sk) } } +void mptcp_subflow_queue_clean(struct sock *listener_ssk) +{ + struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; + struct mptcp_sock *msk, *next, *head = NULL; + struct request_sock *req; + + /* build a list of all unaccepted mptcp sockets */ + spin_lock_bh(&queue->rskq_lock); + for (req = queue->rskq_accept_head; req; req = req->dl_next) { + struct mptcp_subflow_context *subflow; + struct sock *ssk = req->sk; + struct mptcp_sock *msk; + + if (!sk_is_mptcp(ssk)) + continue; + + subflow = mptcp_subflow_ctx(ssk); + if (!subflow || !subflow->conn) + continue; + + /* skip if already in list */ + msk = mptcp_sk(subflow->conn); + if (msk->dl_next || msk == head) + continue; + + msk->dl_next = head; + head = msk; + } + spin_unlock_bh(&queue->rskq_lock); + if (!head) + return; + + /* can't acquire the msk socket lock under the subflow one, + * or will cause ABBA deadlock + */ + release_sock(listener_ssk); + + for (msk = head; msk; msk = next) { + struct sock *sk = (struct sock *)msk; + bool slow; + + slow = lock_sock_fast_nested(sk); + next = msk->dl_next; + msk->first = NULL; + msk->dl_next = NULL; + unlock_sock_fast(sk, slow); + } + + /* we are still under the listener msk socket lock */ + lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); +} + static int subflow_ulp_init(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 78814417d753..80713febfac6 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1803,7 +1803,8 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, pdev = to_platform_device(dev->dev.parent); if (pdev) { np = pdev->dev.of_node; - if (np && of_get_property(np, "mlx,multi-host", NULL)) + if (np && (of_get_property(np, "mellanox,multi-host", NULL) || + of_get_property(np, "mlx,multi-host", NULL))) ndp->mlx_multi_host = true; } diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index 7873bd1389c3..a8e2425e43b0 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -13,14 +13,31 @@ #include <net/netfilter/nf_tables_offload.h> #include <net/netfilter/nf_dup_netdev.h> -static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev) +#define NF_RECURSION_LIMIT 2 + +static DEFINE_PER_CPU(u8, nf_dup_skb_recursion); + +static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, + enum nf_dev_hooks hook) { - if (skb_mac_header_was_set(skb)) + if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT) + goto err; + + if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) { + if (skb_cow_head(skb, skb->mac_len)) + goto err; + skb_push(skb, skb->mac_len); + } skb->dev = dev; skb_clear_tstamp(skb); + __this_cpu_inc(nf_dup_skb_recursion); dev_queue_xmit(skb); + __this_cpu_dec(nf_dup_skb_recursion); + return; +err: + kfree_skb(skb); } void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif) @@ -33,7 +50,7 @@ void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif) return; } - nf_do_netdev_egress(pkt->skb, dev); + nf_do_netdev_egress(pkt->skb, dev, nft_hook(pkt)); } EXPORT_SYMBOL_GPL(nf_fwd_netdev_egress); @@ -48,7 +65,7 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif) skb = skb_clone(pkt->skb, GFP_ATOMIC); if (skb) - nf_do_netdev_egress(skb, dev); + nf_do_netdev_egress(skb, dev, nft_hook(pkt)); } EXPORT_SYMBOL_GPL(nf_dup_netdev_egress); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 51144fc66889..d6b59beab3a9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5213,13 +5213,20 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, struct nft_data *data, struct nlattr *attr) { + u32 dtype; int err; err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr); if (err < 0) return err; - if (desc->type != NFT_DATA_VERDICT && desc->len != set->dlen) { + if (set->dtype == NFT_DATA_VERDICT) + dtype = NFT_DATA_VERDICT; + else + dtype = NFT_DATA_VALUE; + + if (dtype != desc->type || + set->dlen != desc->len) { nft_data_release(data, desc->type); return -EINVAL; } diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 53f40e473855..3ddce24ac76d 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -25,9 +25,7 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info, const struct nft_chain *chain, enum nft_trace_types type) { - const struct nft_pktinfo *pkt = info->pkt; - - if (!info->trace || !pkt->skb->nf_trace) + if (!info->trace || !info->nf_trace) return; info->chain = chain; @@ -42,11 +40,24 @@ static inline void nft_trace_packet(struct nft_traceinfo *info, enum nft_trace_types type) { if (static_branch_unlikely(&nft_trace_enabled)) { + const struct nft_pktinfo *pkt = info->pkt; + + info->nf_trace = pkt->skb->nf_trace; info->rule = rule; __nft_trace_packet(info, chain, type); } } +static inline void nft_trace_copy_nftrace(struct nft_traceinfo *info) +{ + if (static_branch_unlikely(&nft_trace_enabled)) { + const struct nft_pktinfo *pkt = info->pkt; + + if (info->trace) + info->nf_trace = pkt->skb->nf_trace; + } +} + static void nft_bitwise_fast_eval(const struct nft_expr *expr, struct nft_regs *regs) { @@ -85,6 +96,7 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info, const struct nft_chain *chain, const struct nft_regs *regs) { + const struct nft_pktinfo *pkt = info->pkt; enum nft_trace_types type; switch (regs->verdict.code) { @@ -92,8 +104,13 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info, case NFT_RETURN: type = NFT_TRACETYPE_RETURN; break; + case NF_STOLEN: + type = NFT_TRACETYPE_RULE; + /* can't access skb->nf_trace; use copy */ + break; default: type = NFT_TRACETYPE_RULE; + info->nf_trace = pkt->skb->nf_trace; break; } @@ -254,6 +271,7 @@ next_rule: switch (regs.verdict.code) { case NFT_BREAK: regs.verdict.code = NFT_CONTINUE; + nft_trace_copy_nftrace(&info); continue; case NFT_CONTINUE: nft_trace_packet(&info, chain, rule, diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 5041725423c2..1163ba9c1401 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -7,7 +7,7 @@ #include <linux/module.h> #include <linux/static_key.h> #include <linux/hash.h> -#include <linux/jhash.h> +#include <linux/siphash.h> #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/skbuff.h> @@ -25,22 +25,6 @@ DEFINE_STATIC_KEY_FALSE(nft_trace_enabled); EXPORT_SYMBOL_GPL(nft_trace_enabled); -static int trace_fill_id(struct sk_buff *nlskb, struct sk_buff *skb) -{ - __be32 id; - - /* using skb address as ID results in a limited number of - * values (and quick reuse). - * - * So we attempt to use as many skb members that will not - * change while skb is with netfilter. - */ - id = (__be32)jhash_2words(hash32_ptr(skb), skb_get_hash(skb), - skb->skb_iif); - - return nla_put_be32(nlskb, NFTA_TRACE_ID, id); -} - static int trace_fill_header(struct sk_buff *nlskb, u16 type, const struct sk_buff *skb, int off, unsigned int len) @@ -186,6 +170,7 @@ void nft_trace_notify(struct nft_traceinfo *info) struct nlmsghdr *nlh; struct sk_buff *skb; unsigned int size; + u32 mark = 0; u16 event; if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE)) @@ -229,7 +214,7 @@ void nft_trace_notify(struct nft_traceinfo *info) if (nla_put_be32(skb, NFTA_TRACE_TYPE, htonl(info->type))) goto nla_put_failure; - if (trace_fill_id(skb, pkt->skb)) + if (nla_put_u32(skb, NFTA_TRACE_ID, info->skbid)) goto nla_put_failure; if (nla_put_string(skb, NFTA_TRACE_CHAIN, info->chain->name)) @@ -249,16 +234,24 @@ void nft_trace_notify(struct nft_traceinfo *info) case NFT_TRACETYPE_RULE: if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, info->verdict)) goto nla_put_failure; + + /* pkt->skb undefined iff NF_STOLEN, disable dump */ + if (info->verdict->code == NF_STOLEN) + info->packet_dumped = true; + else + mark = pkt->skb->mark; + break; case NFT_TRACETYPE_POLICY: + mark = pkt->skb->mark; + if (nla_put_be32(skb, NFTA_TRACE_POLICY, htonl(info->basechain->policy))) goto nla_put_failure; break; } - if (pkt->skb->mark && - nla_put_be32(skb, NFTA_TRACE_MARK, htonl(pkt->skb->mark))) + if (mark && nla_put_be32(skb, NFTA_TRACE_MARK, htonl(mark))) goto nla_put_failure; if (!info->packet_dumped) { @@ -283,9 +276,20 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, const struct nft_verdict *verdict, const struct nft_chain *chain) { + static siphash_key_t trace_key __read_mostly; + struct sk_buff *skb = pkt->skb; + info->basechain = nft_base_chain(chain); info->trace = true; + info->nf_trace = pkt->skb->nf_trace; info->packet_dumped = false; info->pkt = pkt; info->verdict = verdict; + + net_get_random_once(&trace_key, sizeof(trace_key)); + + info->skbid = (u32)siphash_3u32(hash32_ptr(skb), + skb_get_hash(skb), + skb->skb_iif, + &trace_key); } diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index af15102bc696..f466af4f8531 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -614,7 +614,7 @@ static void __net_exit cttimeout_net_exit(struct net *net) nf_ct_untimeout(net, NULL); - list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, head) { + list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, free_head) { list_del(&cur->free_head); if (refcount_dec_and_test(&cur->refcnt)) diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index ac4859241e17..55d2d49c3425 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -14,6 +14,7 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/random.h> #include <linux/smp.h> #include <linux/static_key.h> #include <net/dst.h> @@ -32,8 +33,6 @@ #define NFT_META_SECS_PER_DAY 86400 #define NFT_META_DAYS_PER_WEEK 7 -static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); - static u8 nft_meta_weekday(void) { time64_t secs = ktime_get_real_seconds(); @@ -271,13 +270,6 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest, return true; } -static noinline u32 nft_prandom_u32(void) -{ - struct rnd_state *state = this_cpu_ptr(&nft_prandom_state); - - return prandom_u32_state(state); -} - #ifdef CONFIG_IP_ROUTE_CLASSID static noinline bool nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest) @@ -389,7 +381,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, break; #endif case NFT_META_PRANDOM: - *dest = nft_prandom_u32(); + *dest = get_random_u32(); break; #ifdef CONFIG_XFRM case NFT_META_SECPATH: @@ -518,7 +510,6 @@ int nft_meta_get_init(const struct nft_ctx *ctx, len = IFNAMSIZ; break; case NFT_META_PRANDOM: - prandom_init_once(&nft_prandom_state); len = sizeof(u32); break; #ifdef CONFIG_XFRM diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 81b40c663d86..45d3dc9e96f2 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -9,12 +9,11 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <linux/random.h> #include <linux/static_key.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> -static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state); - struct nft_ng_inc { u8 dreg; u32 modulus; @@ -135,12 +134,9 @@ struct nft_ng_random { u32 offset; }; -static u32 nft_ng_random_gen(struct nft_ng_random *priv) +static u32 nft_ng_random_gen(const struct nft_ng_random *priv) { - struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state); - - return reciprocal_scale(prandom_u32_state(state), priv->modulus) + - priv->offset; + return reciprocal_scale(get_random_u32(), priv->modulus) + priv->offset; } static void nft_ng_random_eval(const struct nft_expr *expr, @@ -168,8 +164,6 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; - prandom_init_once(&nft_numgen_prandom_state); - return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, sizeof(u32)); } diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index df40314de21f..76de6c8d9865 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -143,6 +143,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, /* Another cpu may race to insert the element with the same key */ if (prev) { nft_set_elem_destroy(set, he, true); + atomic_dec(&set->nelems); he = prev; } @@ -152,6 +153,7 @@ out: err2: nft_set_elem_destroy(set, he, true); + atomic_dec(&set->nelems); err1: return false; } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 2c8051d8cca6..4f9299b9dcdd 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2125,6 +2125,32 @@ out_scratch: } /** + * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array + * @set: nftables API set representation + * @m: matching data pointing to key mapping array + */ +static void nft_set_pipapo_match_destroy(const struct nft_set *set, + struct nft_pipapo_match *m) +{ + struct nft_pipapo_field *f; + int i, r; + + for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) + ; + + for (r = 0; r < f->rules; r++) { + struct nft_pipapo_elem *e; + + if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) + continue; + + e = f->mt[r].e; + + nft_set_elem_destroy(set, e, true); + } +} + +/** * nft_pipapo_destroy() - Free private data for set and all committed elements * @set: nftables API set representation */ @@ -2132,26 +2158,13 @@ static void nft_pipapo_destroy(const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; - struct nft_pipapo_field *f; - int i, r, cpu; + int cpu; m = rcu_dereference_protected(priv->match, true); if (m) { rcu_barrier(); - for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) - ; - - for (r = 0; r < f->rules; r++) { - struct nft_pipapo_elem *e; - - if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) - continue; - - e = f->mt[r].e; - - nft_set_elem_destroy(set, e, true); - } + nft_set_pipapo_match_destroy(set, m); #ifdef NFT_PIPAPO_ALIGN free_percpu(m->scratch_aligned); @@ -2165,6 +2178,11 @@ static void nft_pipapo_destroy(const struct nft_set *set) } if (priv->clone) { + m = priv->clone; + + if (priv->dirty) + nft_set_pipapo_match_destroy(set, m); + #ifdef NFT_PIPAPO_ALIGN free_percpu(priv->clone->scratch_aligned); #endif diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 372bf54a0ca9..e20d1a973417 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -407,7 +407,7 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) if (flags & IP6_FH_F_FRAG) { if (frag_off) { key->ip.frag = OVS_FRAG_TYPE_LATER; - key->ip.proto = nexthdr; + key->ip.proto = NEXTHDR_FRAGMENT; return 0; } key->ip.frag = OVS_FRAG_TYPE_FIRST; diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index fee6409c2bb3..eb0b8197ac82 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -227,8 +227,8 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh) { struct rose_neigh *s; - rose_stop_ftimer(rose_neigh); - rose_stop_t0timer(rose_neigh); + del_timer_sync(&rose_neigh->ftimer); + del_timer_sync(&rose_neigh->t0timer); skb_queue_purge(&rose_neigh->queue); diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index b3138fc2e552..f06ddbed3fed 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -31,89 +31,89 @@ static void rose_idletimer_expiry(struct timer_list *); void rose_start_heartbeat(struct sock *sk) { - del_timer(&sk->sk_timer); + sk_stop_timer(sk, &sk->sk_timer); sk->sk_timer.function = rose_heartbeat_expiry; sk->sk_timer.expires = jiffies + 5 * HZ; - add_timer(&sk->sk_timer); + sk_reset_timer(sk, &sk->sk_timer, sk->sk_timer.expires); } void rose_start_t1timer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->timer); + sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t1; - add_timer(&rose->timer); + sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_t2timer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->timer); + sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t2; - add_timer(&rose->timer); + sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_t3timer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->timer); + sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t3; - add_timer(&rose->timer); + sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_hbtimer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->timer); + sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->hb; - add_timer(&rose->timer); + sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_idletimer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->idletimer); + sk_stop_timer(sk, &rose->idletimer); if (rose->idle > 0) { rose->idletimer.function = rose_idletimer_expiry; rose->idletimer.expires = jiffies + rose->idle; - add_timer(&rose->idletimer); + sk_reset_timer(sk, &rose->idletimer, rose->idletimer.expires); } } void rose_stop_heartbeat(struct sock *sk) { - del_timer(&sk->sk_timer); + sk_stop_timer(sk, &sk->sk_timer); } void rose_stop_timer(struct sock *sk) { - del_timer(&rose_sk(sk)->timer); + sk_stop_timer(sk, &rose_sk(sk)->timer); } void rose_stop_idletimer(struct sock *sk) { - del_timer(&rose_sk(sk)->idletimer); + sk_stop_timer(sk, &rose_sk(sk)->idletimer); } static void rose_heartbeat_expiry(struct timer_list *t) @@ -130,6 +130,7 @@ static void rose_heartbeat_expiry(struct timer_list *t) (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { bh_unlock_sock(sk); rose_destroy_socket(sk); + sock_put(sk); return; } break; @@ -152,6 +153,7 @@ static void rose_heartbeat_expiry(struct timer_list *t) rose_start_heartbeat(sk); bh_unlock_sock(sk); + sock_put(sk); } static void rose_timer_expiry(struct timer_list *t) @@ -181,6 +183,7 @@ static void rose_timer_expiry(struct timer_list *t) break; } bh_unlock_sock(sk); + sock_put(sk); } static void rose_idletimer_expiry(struct timer_list *t) @@ -205,4 +208,5 @@ static void rose_idletimer_expiry(struct timer_list *t) sock_set_flag(sk, SOCK_DEAD); } bh_unlock_sock(sk); + sock_put(sk); } diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 08aab5c01437..258917a714c8 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -431,7 +431,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, break; } - _leave(" = %d [set %hx]", ret, y); + _leave(" = %d [set %x]", ret, y); return ret; } diff --git a/net/sched/act_api.c b/net/sched/act_api.c index da9733da9868..817065aa2833 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -588,7 +588,8 @@ static int tcf_idr_release_unsafe(struct tc_action *p) } static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct nlattr *nest; int n_i = 0; @@ -604,20 +605,25 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, if (nla_put_string(skb, TCA_KIND, ops->kind)) goto nla_put_failure; + ret = 0; mutex_lock(&idrinfo->lock); idr_for_each_entry_ul(idr, p, tmp, id) { if (IS_ERR(p)) continue; ret = tcf_idr_release_unsafe(p); - if (ret == ACT_P_DELETED) { + if (ret == ACT_P_DELETED) module_put(ops->owner); - n_i++; - } else if (ret < 0) { - mutex_unlock(&idrinfo->lock); - goto nla_put_failure; - } + else if (ret < 0) + break; + n_i++; } mutex_unlock(&idrinfo->lock); + if (ret < 0) { + if (n_i) + NL_SET_ERR_MSG(extack, "Unable to flush all TC actions"); + else + goto nla_put_failure; + } ret = nla_put_u32(skb, TCA_FCNT, n_i); if (ret) @@ -638,7 +644,7 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct tcf_idrinfo *idrinfo = tn->idrinfo; if (type == RTM_DELACTION) { - return tcf_del_walker(idrinfo, skb, ops); + return tcf_del_walker(idrinfo, skb, ops, extack); } else if (type == RTM_GETACTION) { return tcf_dump_walker(idrinfo, skb, cb); } else { diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 79c8901f66ab..b759628a47c2 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -442,7 +442,7 @@ static int tcf_police_act_to_flow_act(int tc_act, u32 *extval, act_id = FLOW_ACTION_JUMP; *extval = tc_act & TC_ACT_EXT_VAL_MASK; } else if (tc_act == TC_ACT_UNSPEC) { - NL_SET_ERR_MSG_MOD(extack, "Offload not supported when conform/exceed action is \"continue\""); + act_id = FLOW_ACTION_CONTINUE; } else { NL_SET_ERR_MSG_MOD(extack, "Unsupported conform/exceed action offload"); } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index ed4ccef5d6a8..5449ed114e40 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -1146,9 +1146,9 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_netem_rate rate; struct tc_netem_slot slot; - qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency), + qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency), UINT_MAX); - qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter), + qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter), UINT_MAX); qopt.limit = q->limit; qopt.loss = q->loss; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index b9c71a304d39..0b941dd63d26 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> +#include <linux/time.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> @@ -176,7 +177,7 @@ static ktime_t get_interval_end_time(struct sched_gate_list *sched, static int length_to_duration(struct taprio_sched *q, int len) { - return div_u64(len * atomic64_read(&q->picos_per_byte), 1000); + return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC); } /* Returns the entry corresponding to next available interval. If @@ -551,7 +552,7 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) { atomic_set(&entry->budget, - div64_u64((u64)entry->interval * 1000, + div64_u64((u64)entry->interval * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte))); } diff --git a/net/socket.c b/net/socket.c index 1b6f5e2ebef5..3d7eb2a79e82 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2146,10 +2146,13 @@ SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len, int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, struct sockaddr __user *addr, int __user *addr_len) { + struct sockaddr_storage address; + struct msghdr msg = { + /* Save some cycles and don't copy the address if not needed */ + .msg_name = addr ? (struct sockaddr *)&address : NULL, + }; struct socket *sock; struct iovec iov; - struct msghdr msg; - struct sockaddr_storage address; int err, err2; int fput_needed; @@ -2160,14 +2163,6 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, if (!sock) goto out; - msg.msg_control = NULL; - msg.msg_controllen = 0; - /* Save some cycles and don't copy the address if not needed */ - msg.msg_name = addr ? (struct sockaddr *)&address : NULL; - /* We assume all kernel code knows the size of sockaddr_storage */ - msg.msg_namelen = 0; - msg.msg_iocb = NULL; - msg.msg_flags = 0; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; err = sock_recvmsg(sock, &msg, flags); @@ -2372,6 +2367,7 @@ int __copy_msghdr_from_user(struct msghdr *kmsg, return -EFAULT; kmsg->msg_control_is_user = true; + kmsg->msg_get_inq = 0; kmsg->msg_control_user = msg.msg_control; kmsg->msg_controllen = msg.msg_controllen; kmsg->msg_flags = msg.msg_flags; diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 1a72c67afed5..8299ceb3e373 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -533,6 +533,9 @@ EXPORT_SYMBOL_GPL(strp_check_rcv); static int __init strp_dev_init(void) { + BUILD_BUG_ON(sizeof(struct sk_skb_cb) > + sizeof_field(struct sk_buff, cb)); + strp_wq = create_singlethread_workqueue("kstrp"); if (unlikely(!strp_wq)) return -ENOMEM; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index e2c6eca0271b..b6781ada3aa8 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -651,6 +651,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, new->cl_discrtry = clnt->cl_discrtry; new->cl_chatty = clnt->cl_chatty; new->cl_principal = clnt->cl_principal; + new->cl_max_connect = clnt->cl_max_connect; return new; out_err: diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index f87a2d8f23a7..5d2b3e6979fb 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -984,7 +984,7 @@ static noinline __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, p = page_address(*xdr->page_ptr); xdr->p = p + frag2bytes; space_left = xdr->buf->buflen - xdr->buf->len; - if (space_left - nbytes >= PAGE_SIZE) + if (space_left - frag1bytes >= PAGE_SIZE) xdr->end = p + PAGE_SIZE; else xdr->end = p + space_left - frag1bytes; diff --git a/net/tipc/core.c b/net/tipc/core.c index 3f4542e0f065..434e70eabe08 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -109,10 +109,9 @@ static void __net_exit tipc_exit_net(struct net *net) struct tipc_net *tn = tipc_net(net); tipc_detach_loopback(net); + tipc_net_stop(net); /* Make sure the tipc_net_finalize_work() finished */ cancel_work_sync(&tn->work); - tipc_net_stop(net); - tipc_bcast_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); diff --git a/net/tipc/node.c b/net/tipc/node.c index 6ef95ce565bd..b48d97cbbe29 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -472,8 +472,8 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, bool preliminary) { struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_link *l, *snd_l = tipc_bc_sndlink(net); struct tipc_node *n, *temp_node; - struct tipc_link *l; unsigned long intv; int bearer_id; int i; @@ -488,6 +488,16 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, goto exit; /* A preliminary node becomes "real" now, refresh its data */ tipc_node_write_lock(n); + if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, + tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), + n->capabilities, &n->bc_entry.inputq1, + &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { + pr_warn("Broadcast rcv link refresh failed, no memory\n"); + tipc_node_write_unlock_fast(n); + tipc_node_put(n); + n = NULL; + goto exit; + } n->preliminary = false; n->addr = addr; hlist_del_rcu(&n->hash); @@ -567,7 +577,16 @@ update: n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; - n->bc_entry.link = NULL; + if (!preliminary && + !tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, + tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), + n->capabilities, &n->bc_entry.inputq1, + &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { + pr_warn("Broadcast rcv link creation failed, no memory\n"); + kfree(n); + n = NULL; + goto exit; + } tipc_node_get(n); timer_setup(&n->timer, tipc_node_timeout, 0); /* Start a slow timer anyway, crypto needs it */ @@ -1155,7 +1174,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, bool *respond, bool *dupl_addr) { struct tipc_node *n; - struct tipc_link *l, *snd_l; + struct tipc_link *l; struct tipc_link_entry *le; bool addr_match = false; bool sign_match = false; @@ -1175,22 +1194,6 @@ void tipc_node_check_dest(struct net *net, u32 addr, return; tipc_node_write_lock(n); - if (unlikely(!n->bc_entry.link)) { - snd_l = tipc_bc_sndlink(net); - if (!tipc_link_bc_create(net, tipc_own_addr(net), - addr, peer_id, U16_MAX, - tipc_link_min_win(snd_l), - tipc_link_max_win(snd_l), - n->capabilities, - &n->bc_entry.inputq1, - &n->bc_entry.namedq, snd_l, - &n->bc_entry.link)) { - pr_warn("Broadcast rcv link creation failed, no mem\n"); - tipc_node_write_unlock_fast(n); - tipc_node_put(n); - return; - } - } le = &n->links[b->identity]; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 17f8c523e33b..43509c7e90fc 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -502,6 +502,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, sock_init_data(sock, sk); tipc_set_sk_state(sk, TIPC_OPEN); if (tipc_sk_insert(tsk)) { + sk_free(sk); pr_warn("Socket create failed; port number exhausted\n"); return -EINVAL; } diff --git a/net/tls/tls.h b/net/tls/tls.h new file mode 100644 index 000000000000..8005ee25157d --- /dev/null +++ b/net/tls/tls.h @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. + * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _TLS_INT_H +#define _TLS_INT_H + +#include <asm/byteorder.h> +#include <linux/types.h> +#include <linux/skmsg.h> +#include <net/tls.h> + +#define __TLS_INC_STATS(net, field) \ + __SNMP_INC_STATS((net)->mib.tls_statistics, field) +#define TLS_INC_STATS(net, field) \ + SNMP_INC_STATS((net)->mib.tls_statistics, field) +#define TLS_DEC_STATS(net, field) \ + SNMP_DEC_STATS((net)->mib.tls_statistics, field) + +/* TLS records are maintained in 'struct tls_rec'. It stores the memory pages + * allocated or mapped for each TLS record. After encryption, the records are + * stores in a linked list. + */ +struct tls_rec { + struct list_head list; + int tx_ready; + int tx_flags; + + struct sk_msg msg_plaintext; + struct sk_msg msg_encrypted; + + /* AAD | msg_plaintext.sg.data | sg_tag */ + struct scatterlist sg_aead_in[2]; + /* AAD | msg_encrypted.sg.data (data contains overhead for hdr & iv & tag) */ + struct scatterlist sg_aead_out[2]; + + char content_type; + struct scatterlist sg_content_type; + + char aad_space[TLS_AAD_SPACE_SIZE]; + u8 iv_data[MAX_IV_SIZE]; + struct aead_request aead_req; + u8 aead_req_ctx[]; +}; + +int __net_init tls_proc_init(struct net *net); +void __net_exit tls_proc_fini(struct net *net); + +struct tls_context *tls_ctx_create(struct sock *sk); +void tls_ctx_free(struct sock *sk, struct tls_context *ctx); +void update_sk_prot(struct sock *sk, struct tls_context *ctx); + +int wait_on_pending_writer(struct sock *sk, long *timeo); +int tls_sk_query(struct sock *sk, int optname, char __user *optval, + int __user *optlen); +int tls_sk_attach(struct sock *sk, int optname, char __user *optval, + unsigned int optlen); +void tls_err_abort(struct sock *sk, int err); + +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); +void tls_update_rx_zc_capable(struct tls_context *tls_ctx); +void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); +void tls_sw_strparser_done(struct tls_context *tls_ctx); +int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +int tls_sw_sendpage_locked(struct sock *sk, struct page *page, + int offset, size_t size, int flags); +int tls_sw_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags); +void tls_sw_cancel_work_tx(struct tls_context *tls_ctx); +void tls_sw_release_resources_tx(struct sock *sk); +void tls_sw_free_ctx_tx(struct tls_context *tls_ctx); +void tls_sw_free_resources_rx(struct sock *sk); +void tls_sw_release_resources_rx(struct sock *sk); +void tls_sw_free_ctx_rx(struct tls_context *tls_ctx); +int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int flags, int *addr_len); +bool tls_sw_sock_is_readable(struct sock *sk); +ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags); + +int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +int tls_device_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags); +int tls_tx_records(struct sock *sk, int flags); + +void tls_sw_write_space(struct sock *sk, struct tls_context *ctx); +void tls_device_write_space(struct sock *sk, struct tls_context *ctx); + +int tls_process_cmsg(struct sock *sk, struct msghdr *msg, + unsigned char *record_type); +int decrypt_skb(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout); + +int tls_sw_fallback_init(struct sock *sk, + struct tls_offload_context_tx *offload_ctx, + struct tls_crypto_info *crypto_info); + +static inline struct tls_msg *tls_msg(struct sk_buff *skb) +{ + struct sk_skb_cb *scb = (struct sk_skb_cb *)skb->cb; + + return &scb->tls; +} + +#ifdef CONFIG_TLS_DEVICE +void tls_device_init(void); +void tls_device_cleanup(void); +int tls_set_device_offload(struct sock *sk, struct tls_context *ctx); +void tls_device_free_resources_tx(struct sock *sk); +int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx); +void tls_device_offload_cleanup_rx(struct sock *sk); +void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq); +int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx, + struct sk_buff *skb, struct strp_msg *rxm); +#else +static inline void tls_device_init(void) {} +static inline void tls_device_cleanup(void) {} + +static inline int +tls_set_device_offload(struct sock *sk, struct tls_context *ctx) +{ + return -EOPNOTSUPP; +} + +static inline void tls_device_free_resources_tx(struct sock *sk) {} + +static inline int +tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) +{ + return -EOPNOTSUPP; +} + +static inline void tls_device_offload_cleanup_rx(struct sock *sk) {} +static inline void +tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) {} + +static inline int +tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx, + struct sk_buff *skb, struct strp_msg *rxm) +{ + return 0; +} +#endif + +int tls_push_sg(struct sock *sk, struct tls_context *ctx, + struct scatterlist *sg, u16 first_offset, + int flags); +int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, + int flags); +void tls_free_partial_record(struct sock *sk, struct tls_context *ctx); + +static inline bool tls_is_partially_sent_record(struct tls_context *ctx) +{ + return !!ctx->partially_sent_record; +} + +static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) +{ + return tls_ctx->pending_open_record_frags; +} + +static inline bool tls_bigint_increment(unsigned char *seq, int len) +{ + int i; + + for (i = len - 1; i >= 0; i--) { + ++seq[i]; + if (seq[i] != 0) + break; + } + + return (i == -1); +} + +static inline void tls_bigint_subtract(unsigned char *seq, int n) +{ + u64 rcd_sn; + __be64 *p; + + BUILD_BUG_ON(TLS_MAX_REC_SEQ_SIZE != 8); + + p = (__be64 *)seq; + rcd_sn = be64_to_cpu(*p); + *p = cpu_to_be64(rcd_sn - n); +} + +static inline void +tls_advance_record_sn(struct sock *sk, struct tls_prot_info *prot, + struct cipher_context *ctx) +{ + if (tls_bigint_increment(ctx->rec_seq, prot->rec_seq_size)) + tls_err_abort(sk, -EBADMSG); + + if (prot->version != TLS_1_3_VERSION && + prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305) + tls_bigint_increment(ctx->iv + prot->salt_size, + prot->iv_size); +} + +static inline void +tls_xor_iv_with_seq(struct tls_prot_info *prot, char *iv, char *seq) +{ + int i; + + if (prot->version == TLS_1_3_VERSION || + prot->cipher_type == TLS_CIPHER_CHACHA20_POLY1305) { + for (i = 0; i < 8; i++) + iv[i + 4] ^= seq[i]; + } +} + +static inline void +tls_fill_prepend(struct tls_context *ctx, char *buf, size_t plaintext_len, + unsigned char record_type) +{ + struct tls_prot_info *prot = &ctx->prot_info; + size_t pkt_len, iv_size = prot->iv_size; + + pkt_len = plaintext_len + prot->tag_size; + if (prot->version != TLS_1_3_VERSION && + prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305) { + pkt_len += iv_size; + + memcpy(buf + TLS_NONCE_OFFSET, + ctx->tx.iv + prot->salt_size, iv_size); + } + + /* we cover nonce explicit here as well, so buf should be of + * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE + */ + buf[0] = prot->version == TLS_1_3_VERSION ? + TLS_RECORD_TYPE_DATA : record_type; + /* Note that VERSION must be TLS_1_2 for both TLS1.2 and TLS1.3 */ + buf[1] = TLS_1_2_VERSION_MINOR; + buf[2] = TLS_1_2_VERSION_MAJOR; + /* we can use IV for nonce explicit according to spec */ + buf[3] = pkt_len >> 8; + buf[4] = pkt_len & 0xFF; +} + +static inline +void tls_make_aad(char *buf, size_t size, char *record_sequence, + unsigned char record_type, struct tls_prot_info *prot) +{ + if (prot->version != TLS_1_3_VERSION) { + memcpy(buf, record_sequence, prot->rec_seq_size); + buf += 8; + } else { + size += prot->tag_size; + } + + buf[0] = prot->version == TLS_1_3_VERSION ? + TLS_RECORD_TYPE_DATA : record_type; + buf[1] = TLS_1_2_VERSION_MAJOR; + buf[2] = TLS_1_2_VERSION_MINOR; + buf[3] = size >> 8; + buf[4] = size & 0xFF; +} + +#endif diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index ec6f4b699a2b..227b92a3064a 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -38,6 +38,7 @@ #include <net/tcp.h> #include <net/tls.h> +#include "tls.h" #include "trace.h" /* device_offload_lock is used to synchronize tls_dev_add @@ -562,7 +563,7 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) lock_sock(sk); if (unlikely(msg->msg_controllen)) { - rc = tls_proccess_cmsg(sk, msg, &record_type); + rc = tls_process_cmsg(sk, msg, &record_type); if (rc) goto out; } diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index e40bedd112b6..618cee704217 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -34,6 +34,8 @@ #include <crypto/scatterwalk.h> #include <net/ip6_checksum.h> +#include "tls.h" + static void chain_to_walk(struct scatterlist *sg, struct scatter_walk *walk) { struct scatterlist *src = walk->sg; @@ -232,7 +234,7 @@ static int fill_sg_in(struct scatterlist *sg_in, s32 *sync_size, int *resync_sgs) { - int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); + int tcp_payload_offset = skb_tcp_all_headers(skb); int payload_len = skb->len - tcp_payload_offset; u32 tcp_seq = ntohl(tcp_hdr(skb)->seq); struct tls_record_info *record; @@ -310,8 +312,8 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, struct sk_buff *skb, s32 sync_size, u64 rcd_sn) { - int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); + int tcp_payload_offset = skb_tcp_all_headers(skb); int payload_len = skb->len - tcp_payload_offset; void *buf, *iv, *aad, *dummy_buf; struct aead_request *aead_req; @@ -372,7 +374,7 @@ free_nskb: static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb) { - int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); + int tcp_payload_offset = skb_tcp_all_headers(skb); struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int payload_len = skb->len - tcp_payload_offset; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index da176411c1b5..f3d9dbfa507e 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -45,6 +45,8 @@ #include <net/tls.h> #include <net/tls_toe.h> +#include "tls.h" + MODULE_AUTHOR("Mellanox Technologies"); MODULE_DESCRIPTION("Transport Layer Security Support"); MODULE_LICENSE("Dual BSD/GPL"); @@ -164,8 +166,8 @@ static int tls_handle_open_record(struct sock *sk, int flags) return 0; } -int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, - unsigned char *record_type) +int tls_process_cmsg(struct sock *sk, struct msghdr *msg, + unsigned char *record_type) { struct cmsghdr *cmsg; int rc = -EINVAL; @@ -533,6 +535,37 @@ static int do_tls_getsockopt_tx_zc(struct sock *sk, char __user *optval, return 0; } +static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval, + int __user *optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + unsigned int value; + int err, len; + + if (ctx->prot_info.version != TLS_1_3_VERSION) + return -EINVAL; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < sizeof(value)) + return -EINVAL; + + lock_sock(sk); + err = -EINVAL; + if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW) + value = ctx->rx_no_pad; + release_sock(sk); + if (err) + return err; + + if (put_user(sizeof(value), optlen)) + return -EFAULT; + if (copy_to_user(optval, &value, sizeof(value))) + return -EFAULT; + + return 0; +} + static int do_tls_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { @@ -547,6 +580,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname, case TLS_TX_ZEROCOPY_RO: rc = do_tls_getsockopt_tx_zc(sk, optval, optlen); break; + case TLS_RX_EXPECT_NO_PAD: + rc = do_tls_getsockopt_no_pad(sk, optval, optlen); + break; default: rc = -ENOPROTOOPT; break; @@ -718,6 +754,38 @@ static int do_tls_setsockopt_tx_zc(struct sock *sk, sockptr_t optval, return 0; } +static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval, + unsigned int optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + u32 val; + int rc; + + if (ctx->prot_info.version != TLS_1_3_VERSION || + sockptr_is_null(optval) || optlen < sizeof(val)) + return -EINVAL; + + rc = copy_from_sockptr(&val, optval, sizeof(val)); + if (rc) + return -EFAULT; + if (val > 1) + return -EINVAL; + rc = check_zeroed_sockptr(optval, sizeof(val), optlen - sizeof(val)); + if (rc < 1) + return rc == 0 ? -EINVAL : rc; + + lock_sock(sk); + rc = -EINVAL; + if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW) { + ctx->rx_no_pad = val; + tls_update_rx_zc_capable(ctx); + rc = 0; + } + release_sock(sk); + + return rc; +} + static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { @@ -736,6 +804,9 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, rc = do_tls_setsockopt_tx_zc(sk, optval, optlen); release_sock(sk); break; + case TLS_RX_EXPECT_NO_PAD: + rc = do_tls_setsockopt_no_pad(sk, optval, optlen); + break; default: rc = -ENOPROTOOPT; break; @@ -921,6 +992,8 @@ static void tls_update(struct sock *sk, struct proto *p, { struct tls_context *ctx; + WARN_ON_ONCE(sk->sk_prot == p); + ctx = tls_get_ctx(sk); if (likely(ctx)) { ctx->sk_write_space = write_space; @@ -932,6 +1005,23 @@ static void tls_update(struct sock *sk, struct proto *p, } } +static u16 tls_user_config(struct tls_context *ctx, bool tx) +{ + u16 config = tx ? ctx->tx_conf : ctx->rx_conf; + + switch (config) { + case TLS_BASE: + return TLS_CONF_BASE; + case TLS_SW: + return TLS_CONF_SW; + case TLS_HW: + return TLS_CONF_HW; + case TLS_HW_RECORD: + return TLS_CONF_HW_RECORD; + } + return 0; +} + static int tls_get_info(const struct sock *sk, struct sk_buff *skb) { u16 version, cipher_type; @@ -974,6 +1064,11 @@ static int tls_get_info(const struct sock *sk, struct sk_buff *skb) if (err) goto nla_failure; } + if (ctx->rx_no_pad) { + err = nla_put_flag(skb, TLS_INFO_RX_NO_PAD); + if (err) + goto nla_failure; + } rcu_read_unlock(); nla_nest_end(skb, start); @@ -995,6 +1090,7 @@ static size_t tls_get_info_size(const struct sock *sk) nla_total_size(sizeof(u16)) + /* TLS_INFO_RXCONF */ nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */ nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */ + nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */ 0; return size; diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c index feeceb0e4cb4..1246e52b48f6 100644 --- a/net/tls/tls_proc.c +++ b/net/tls/tls_proc.c @@ -6,6 +6,8 @@ #include <net/snmp.h> #include <net/tls.h> +#include "tls.h" + #ifdef CONFIG_PROC_FS static const struct snmp_mib tls_mib_list[] = { SNMP_MIB_ITEM("TlsCurrTxSw", LINUX_MIB_TLSCURRTXSW), @@ -18,6 +20,7 @@ static const struct snmp_mib tls_mib_list[] = { SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE), SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR), SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC), + SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIN_TLSDECRYPTRETRY), SNMP_MIB_SENTINEL }; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 0513f82b8537..09370f853031 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -44,9 +44,19 @@ #include <net/strparser.h> #include <net/tls.h> +#include "tls.h" + struct tls_decrypt_arg { bool zc; bool async; + u8 tail; +}; + +struct tls_decrypt_ctx { + u8 iv[MAX_IV_SIZE]; + u8 aad[TLS_MAX_AAD_SIZE]; + u8 tail; + struct scatterlist sg[]; }; noinline void tls_err_abort(struct sock *sk, int err) @@ -133,7 +143,8 @@ static int skb_nsg(struct sk_buff *skb, int offset, int len) return __skb_nsg(skb, offset, len, 0); } -static int padding_length(struct tls_prot_info *prot, struct sk_buff *skb) +static int tls_padding_length(struct tls_prot_info *prot, struct sk_buff *skb, + struct tls_decrypt_arg *darg) { struct strp_msg *rxm = strp_msg(skb); struct tls_msg *tlm = tls_msg(skb); @@ -142,7 +153,7 @@ static int padding_length(struct tls_prot_info *prot, struct sk_buff *skb) /* Determine zero-padding length */ if (prot->version == TLS_1_3_VERSION) { int offset = rxm->full_len - TLS_TAG_SIZE - 1; - char content_type = 0; + char content_type = darg->zc ? darg->tail : 0; int err; while (content_type == 0) { @@ -267,9 +278,6 @@ static int tls_do_decryption(struct sock *sk, } darg->async = false; - if (ret == -EBADMSG) - TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR); - return ret; } @@ -518,7 +526,8 @@ static int tls_do_encryption(struct sock *sk, memcpy(&rec->iv_data[iv_offset], tls_ctx->tx.iv, prot->iv_size + prot->salt_size); - xor_iv_with_seq(prot, rec->iv_data + iv_offset, tls_ctx->tx.rec_seq); + tls_xor_iv_with_seq(prot, rec->iv_data + iv_offset, + tls_ctx->tx.rec_seq); sge->offset += prot->prepend_size; sge->length -= prot->prepend_size; @@ -955,7 +964,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) lock_sock(sk); if (unlikely(msg->msg_controllen)) { - ret = tls_proccess_cmsg(sk, msg, &record_type); + ret = tls_process_cmsg(sk, msg, &record_type); if (ret) { if (ret == -EINPROGRESS) num_async++; @@ -1293,54 +1302,50 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, return ret; } -static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock, - bool nonblock, long timeo, int *err) +static int +tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, + long timeo) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - struct sk_buff *skb; DEFINE_WAIT_FUNC(wait, woken_wake_function); - while (!(skb = ctx->recv_pkt) && sk_psock_queue_empty(psock)) { - if (sk->sk_err) { - *err = sock_error(sk); - return NULL; - } + while (!ctx->recv_pkt) { + if (!sk_psock_queue_empty(psock)) + return 0; + + if (sk->sk_err) + return sock_error(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) { __strp_unpause(&ctx->strp); if (ctx->recv_pkt) - return ctx->recv_pkt; + break; } if (sk->sk_shutdown & RCV_SHUTDOWN) - return NULL; + return 0; if (sock_flag(sk, SOCK_DONE)) - return NULL; + return 0; - if (nonblock || !timeo) { - *err = -EAGAIN; - return NULL; - } + if (nonblock || !timeo) + return -EAGAIN; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); sk_wait_event(sk, &timeo, - ctx->recv_pkt != skb || - !sk_psock_queue_empty(psock), + ctx->recv_pkt || !sk_psock_queue_empty(psock), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); /* Handle signals */ - if (signal_pending(current)) { - *err = sock_intr_errno(timeo); - return NULL; - } + if (signal_pending(current)) + return sock_intr_errno(timeo); } - return skb; + return 1; } static int tls_setup_from_iter(struct iov_iter *from, @@ -1415,21 +1420,22 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct tls_prot_info *prot = &tls_ctx->prot_info; + int n_sgin, n_sgout, aead_size, err, pages = 0; struct strp_msg *rxm = strp_msg(skb); struct tls_msg *tlm = tls_msg(skb); - int n_sgin, n_sgout, nsg, mem_size, aead_size, err, pages = 0; struct aead_request *aead_req; struct sk_buff *unused; - u8 *aad, *iv, *mem = NULL; struct scatterlist *sgin = NULL; struct scatterlist *sgout = NULL; - const int data_len = rxm->full_len - prot->overhead_size + - prot->tail_size; + const int data_len = rxm->full_len - prot->overhead_size; + int tail_pages = !!prot->tail_size; + struct tls_decrypt_ctx *dctx; int iv_offset = 0; + u8 *mem; if (darg->zc && (out_iov || out_sg)) { if (out_iov) - n_sgout = 1 + + n_sgout = 1 + tail_pages + iov_iter_npages_cap(out_iov, INT_MAX, data_len); else n_sgout = sg_nents(out_sg); @@ -1447,36 +1453,30 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, /* Increment to accommodate AAD */ n_sgin = n_sgin + 1; - nsg = n_sgin + n_sgout; - - aead_size = sizeof(*aead_req) + crypto_aead_reqsize(ctx->aead_recv); - mem_size = aead_size + (nsg * sizeof(struct scatterlist)); - mem_size = mem_size + prot->aad_size; - mem_size = mem_size + MAX_IV_SIZE; - /* Allocate a single block of memory which contains - * aead_req || sgin[] || sgout[] || aad || iv. - * This order achieves correct alignment for aead_req, sgin, sgout. + * aead_req || tls_decrypt_ctx. + * Both structs are variable length. */ - mem = kmalloc(mem_size, sk->sk_allocation); + aead_size = sizeof(*aead_req) + crypto_aead_reqsize(ctx->aead_recv); + mem = kmalloc(aead_size + struct_size(dctx, sg, n_sgin + n_sgout), + sk->sk_allocation); if (!mem) return -ENOMEM; /* Segment the allocated memory */ aead_req = (struct aead_request *)mem; - sgin = (struct scatterlist *)(mem + aead_size); - sgout = sgin + n_sgin; - aad = (u8 *)(sgout + n_sgout); - iv = aad + prot->aad_size; + dctx = (struct tls_decrypt_ctx *)(mem + aead_size); + sgin = &dctx->sg[0]; + sgout = &dctx->sg[n_sgin]; /* For CCM based ciphers, first byte of nonce+iv is a constant */ switch (prot->cipher_type) { case TLS_CIPHER_AES_CCM_128: - iv[0] = TLS_AES_CCM_IV_B0_BYTE; + dctx->iv[0] = TLS_AES_CCM_IV_B0_BYTE; iv_offset = 1; break; case TLS_CIPHER_SM4_CCM: - iv[0] = TLS_SM4_CCM_IV_B0_BYTE; + dctx->iv[0] = TLS_SM4_CCM_IV_B0_BYTE; iv_offset = 1; break; } @@ -1484,46 +1484,49 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, /* Prepare IV */ if (prot->version == TLS_1_3_VERSION || prot->cipher_type == TLS_CIPHER_CHACHA20_POLY1305) { - memcpy(iv + iv_offset, tls_ctx->rx.iv, + memcpy(&dctx->iv[iv_offset], tls_ctx->rx.iv, prot->iv_size + prot->salt_size); } else { err = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, - iv + iv_offset + prot->salt_size, + &dctx->iv[iv_offset] + prot->salt_size, prot->iv_size); - if (err < 0) { - kfree(mem); - return err; - } - memcpy(iv + iv_offset, tls_ctx->rx.iv, prot->salt_size); + if (err < 0) + goto exit_free; + memcpy(&dctx->iv[iv_offset], tls_ctx->rx.iv, prot->salt_size); } - xor_iv_with_seq(prot, iv + iv_offset, tls_ctx->rx.rec_seq); + tls_xor_iv_with_seq(prot, &dctx->iv[iv_offset], tls_ctx->rx.rec_seq); /* Prepare AAD */ - tls_make_aad(aad, rxm->full_len - prot->overhead_size + + tls_make_aad(dctx->aad, rxm->full_len - prot->overhead_size + prot->tail_size, tls_ctx->rx.rec_seq, tlm->control, prot); /* Prepare sgin */ sg_init_table(sgin, n_sgin); - sg_set_buf(&sgin[0], aad, prot->aad_size); + sg_set_buf(&sgin[0], dctx->aad, prot->aad_size); err = skb_to_sgvec(skb, &sgin[1], rxm->offset + prot->prepend_size, rxm->full_len - prot->prepend_size); - if (err < 0) { - kfree(mem); - return err; - } + if (err < 0) + goto exit_free; if (n_sgout) { if (out_iov) { sg_init_table(sgout, n_sgout); - sg_set_buf(&sgout[0], aad, prot->aad_size); + sg_set_buf(&sgout[0], dctx->aad, prot->aad_size); err = tls_setup_from_iter(out_iov, data_len, &pages, &sgout[1], - (n_sgout - 1)); + (n_sgout - 1 - tail_pages)); if (err < 0) goto fallback_to_reg_recv; + + if (prot->tail_size) { + sg_unmark_end(&sgout[pages]); + sg_set_buf(&sgout[pages + 1], &dctx->tail, + prot->tail_size); + sg_mark_end(&sgout[pages + 1]); + } } else if (out_sg) { memcpy(sgout, out_sg, n_sgout * sizeof(*sgout)); } else { @@ -1537,15 +1540,18 @@ fallback_to_reg_recv: } /* Prepare and submit AEAD request */ - err = tls_do_decryption(sk, skb, sgin, sgout, iv, - data_len, aead_req, darg); + err = tls_do_decryption(sk, skb, sgin, sgout, dctx->iv, + data_len + prot->tail_size, aead_req, darg); if (darg->async) return 0; + if (prot->tail_size) + darg->tail = dctx->tail; + /* Release the pages in case iov was mapped to pages */ for (; pages > 0; pages--) put_page(sg_page(&sgout[pages])); - +exit_free: kfree(mem); return err; } @@ -1579,13 +1585,23 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, } err = decrypt_internal(sk, skb, dest, NULL, darg); - if (err < 0) + if (err < 0) { + if (err == -EBADMSG) + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR); return err; + } if (darg->async) goto decrypt_next; + /* If opportunistic TLS 1.3 ZC failed retry without ZC */ + if (unlikely(darg->zc && prot->version == TLS_1_3_VERSION && + darg->tail != TLS_RECORD_TYPE_DATA)) { + darg->zc = false; + TLS_INC_STATS(sock_net(sk), LINUX_MIN_TLSDECRYPTRETRY); + return decrypt_skb_update(sk, skb, dest, darg); + } decrypt_done: - pad = padding_length(prot, skb); + pad = tls_padding_length(prot, skb, darg); if (pad < 0) return pad; @@ -1717,6 +1733,24 @@ out: return copied ? : err; } +static void +tls_read_flush_backlog(struct sock *sk, struct tls_prot_info *prot, + size_t len_left, size_t decrypted, ssize_t done, + size_t *flushed_at) +{ + size_t max_rec; + + if (len_left <= decrypted) + return; + + max_rec = prot->overhead_size - prot->tail_size + TLS_MAX_PAYLOAD_SIZE; + if (done - *flushed_at < SZ_128K && tcp_inq(sk) > max_rec) + return; + + *flushed_at = done; + sk_flush_backlog(sk); +} + int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, @@ -1729,6 +1763,7 @@ int tls_sw_recvmsg(struct sock *sk, struct sk_psock *psock; unsigned char control = 0; ssize_t decrypted = 0; + size_t flushed_at = 0; struct strp_msg *rxm; struct tls_msg *tlm; struct sk_buff *skb; @@ -1767,14 +1802,14 @@ int tls_sw_recvmsg(struct sock *sk, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); zc_capable = !bpf_strp_enabled && !is_kvec && !is_peek && - prot->version != TLS_1_3_VERSION; + ctx->zc_capable; decrypted = 0; while (len && (decrypted + copied < target || ctx->recv_pkt)) { struct tls_decrypt_arg darg = {}; int to_decrypt, chunk; - skb = tls_wait_data(sk, psock, flags & MSG_DONTWAIT, timeo, &err); - if (!skb) { + err = tls_rx_rec_wait(sk, psock, flags & MSG_DONTWAIT, timeo); + if (err <= 0) { if (psock) { chunk = sk_msg_recvmsg(sk, psock, msg, len, flags); @@ -1784,6 +1819,7 @@ int tls_sw_recvmsg(struct sock *sk, goto recv_end; } + skb = ctx->recv_pkt; rxm = strp_msg(skb); tlm = tls_msg(skb); @@ -1818,6 +1854,10 @@ int tls_sw_recvmsg(struct sock *sk, if (err <= 0) goto recv_end; + /* periodically flush backlog, and feed strparser */ + tls_read_flush_backlog(sk, prot, len, to_decrypt, + decrypted + copied, &flushed_at); + ctx->recv_pkt = NULL; __strp_unpause(&ctx->strp); __skb_queue_tail(&ctx->rx_list, skb); @@ -1946,11 +1986,13 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, } else { struct tls_decrypt_arg darg = {}; - skb = tls_wait_data(sk, NULL, flags & SPLICE_F_NONBLOCK, timeo, - &err); - if (!skb) + err = tls_rx_rec_wait(sk, NULL, flags & SPLICE_F_NONBLOCK, + timeo); + if (err <= 0) goto splice_read_end; + skb = ctx->recv_pkt; + err = decrypt_skb_update(sk, skb, NULL, &darg); if (err < 0) { tls_err_abort(sk, -EBADMSG); @@ -2227,12 +2269,23 @@ static void tx_work_handler(struct work_struct *work) mutex_unlock(&tls_ctx->tx_lock); } +static bool tls_is_tx_ready(struct tls_sw_context_tx *ctx) +{ + struct tls_rec *rec; + + rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); + if (!rec) + return false; + + return READ_ONCE(rec->tx_ready); +} + void tls_sw_write_space(struct sock *sk, struct tls_context *ctx) { struct tls_sw_context_tx *tx_ctx = tls_sw_ctx_tx(ctx); /* Schedule the transmission if tx list is ready */ - if (is_tx_ready(tx_ctx) && + if (tls_is_tx_ready(tx_ctx) && !test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask)) schedule_delayed_work(&tx_ctx->tx_work.work, 0); } @@ -2249,6 +2302,14 @@ void tls_sw_strparser_arm(struct sock *sk, struct tls_context *tls_ctx) strp_check_rcv(&rx_ctx->strp); } +void tls_update_rx_zc_capable(struct tls_context *tls_ctx) +{ + struct tls_sw_context_rx *rx_ctx = tls_sw_ctx_rx(tls_ctx); + + rx_ctx->zc_capable = tls_ctx->rx_no_pad || + tls_ctx->prot_info.version != TLS_1_3_VERSION; +} + int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) { struct tls_context *tls_ctx = tls_get_ctx(sk); @@ -2422,13 +2483,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) goto free_priv; } - /* Sanity-check the sizes for stack allocations. */ - if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE || - rec_seq_size > TLS_MAX_REC_SEQ_SIZE || tag_size != TLS_TAG_SIZE) { - rc = -EINVAL; - goto free_priv; - } - if (crypto_info->version == TLS_1_3_VERSION) { nonce_size = 0; prot->aad_size = TLS_HEADER_SIZE; @@ -2438,6 +2492,14 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) prot->tail_size = 0; } + /* Sanity-check the sizes for stack allocations. */ + if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE || + rec_seq_size > TLS_MAX_REC_SEQ_SIZE || tag_size != TLS_TAG_SIZE || + prot->aad_size > TLS_MAX_AAD_SIZE) { + rc = -EINVAL; + goto free_priv; + } + prot->version = crypto_info->version; prot->cipher_type = crypto_info->cipher_type; prot->prepend_size = TLS_HEADER_SIZE + nonce_size; @@ -2484,12 +2546,10 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) if (sw_ctx_rx) { tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv); - if (crypto_info->version == TLS_1_3_VERSION) - sw_ctx_rx->async_capable = 0; - else - sw_ctx_rx->async_capable = - !!(tfm->__crt_alg->cra_flags & - CRYPTO_ALG_ASYNC); + tls_update_rx_zc_capable(ctx); + sw_ctx_rx->async_capable = + crypto_info->version != TLS_1_3_VERSION && + !!(tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC); /* Set up strparser */ memset(&cb, 0, sizeof(cb)); diff --git a/net/tls/tls_toe.c b/net/tls/tls_toe.c index 7e1330f19165..825669e1ab47 100644 --- a/net/tls/tls_toe.c +++ b/net/tls/tls_toe.c @@ -38,6 +38,8 @@ #include <net/tls.h> #include <net/tls_toe.h> +#include "tls.h" + static LIST_HEAD(device_list); static DEFINE_SPINLOCK(device_spinlock); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1bed3739768c..bf338b782fc4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -118,15 +118,13 @@ #include "scm.h" -spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE]; -EXPORT_SYMBOL_GPL(unix_table_locks); -struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; -EXPORT_SYMBOL_GPL(unix_socket_table); static atomic_long_t unix_nr_socks; +static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; +static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; /* SMP locking strategy: - * hash table is protected with spinlock unix_table_locks - * each socket state is protected by separate spin lock. + * hash table is protected with spinlock. + * each socket state is protected by separate spinlock. */ static unsigned int unix_unbound_hash(struct sock *sk) @@ -137,12 +135,12 @@ static unsigned int unix_unbound_hash(struct sock *sk) hash ^= hash >> 8; hash ^= sk->sk_type; - return UNIX_HASH_SIZE + (hash & (UNIX_HASH_SIZE - 1)); + return hash & UNIX_HASH_MOD; } static unsigned int unix_bsd_hash(struct inode *i) { - return i->i_ino & (UNIX_HASH_SIZE - 1); + return i->i_ino & UNIX_HASH_MOD; } static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, @@ -155,26 +153,34 @@ static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, hash ^= hash >> 8; hash ^= type; - return hash & (UNIX_HASH_SIZE - 1); + return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD); } -static void unix_table_double_lock(unsigned int hash1, unsigned int hash2) +static void unix_table_double_lock(struct net *net, + unsigned int hash1, unsigned int hash2) { - /* hash1 and hash2 is never the same because - * one is between 0 and UNIX_HASH_SIZE - 1, and - * another is between UNIX_HASH_SIZE and UNIX_HASH_SIZE * 2. - */ + if (hash1 == hash2) { + spin_lock(&net->unx.table.locks[hash1]); + return; + } + if (hash1 > hash2) swap(hash1, hash2); - spin_lock(&unix_table_locks[hash1]); - spin_lock_nested(&unix_table_locks[hash2], SINGLE_DEPTH_NESTING); + spin_lock(&net->unx.table.locks[hash1]); + spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING); } -static void unix_table_double_unlock(unsigned int hash1, unsigned int hash2) +static void unix_table_double_unlock(struct net *net, + unsigned int hash1, unsigned int hash2) { - spin_unlock(&unix_table_locks[hash1]); - spin_unlock(&unix_table_locks[hash2]); + if (hash1 == hash2) { + spin_unlock(&net->unx.table.locks[hash1]); + return; + } + + spin_unlock(&net->unx.table.locks[hash1]); + spin_unlock(&net->unx.table.locks[hash2]); } #ifdef CONFIG_SECURITY_NETWORK @@ -300,34 +306,52 @@ static void __unix_remove_socket(struct sock *sk) sk_del_node_init(sk); } -static void __unix_insert_socket(struct sock *sk) +static void __unix_insert_socket(struct net *net, struct sock *sk) { DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); - sk_add_node(sk, &unix_socket_table[sk->sk_hash]); + sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]); } -static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr, - unsigned int hash) +static void __unix_set_addr_hash(struct net *net, struct sock *sk, + struct unix_address *addr, unsigned int hash) { __unix_remove_socket(sk); smp_store_release(&unix_sk(sk)->addr, addr); sk->sk_hash = hash; - __unix_insert_socket(sk); + __unix_insert_socket(net, sk); } -static void unix_remove_socket(struct sock *sk) +static void unix_remove_socket(struct net *net, struct sock *sk) { - spin_lock(&unix_table_locks[sk->sk_hash]); + spin_lock(&net->unx.table.locks[sk->sk_hash]); __unix_remove_socket(sk); - spin_unlock(&unix_table_locks[sk->sk_hash]); + spin_unlock(&net->unx.table.locks[sk->sk_hash]); +} + +static void unix_insert_unbound_socket(struct net *net, struct sock *sk) +{ + spin_lock(&net->unx.table.locks[sk->sk_hash]); + __unix_insert_socket(net, sk); + spin_unlock(&net->unx.table.locks[sk->sk_hash]); +} + +static void unix_insert_bsd_socket(struct sock *sk) +{ + spin_lock(&bsd_socket_locks[sk->sk_hash]); + sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]); + spin_unlock(&bsd_socket_locks[sk->sk_hash]); } -static void unix_insert_unbound_socket(struct sock *sk) +static void unix_remove_bsd_socket(struct sock *sk) { - spin_lock(&unix_table_locks[sk->sk_hash]); - __unix_insert_socket(sk); - spin_unlock(&unix_table_locks[sk->sk_hash]); + if (!hlist_unhashed(&sk->sk_bind_node)) { + spin_lock(&bsd_socket_locks[sk->sk_hash]); + __sk_del_bind_node(sk); + spin_unlock(&bsd_socket_locks[sk->sk_hash]); + + sk_node_init(&sk->sk_bind_node); + } } static struct sock *__unix_find_socket_byname(struct net *net, @@ -336,12 +360,9 @@ static struct sock *__unix_find_socket_byname(struct net *net, { struct sock *s; - sk_for_each(s, &unix_socket_table[hash]) { + sk_for_each(s, &net->unx.table.buckets[hash]) { struct unix_sock *u = unix_sk(s); - if (!net_eq(sock_net(s), net)) - continue; - if (u->addr->len == len && !memcmp(u->addr->name, sunname, len)) return s; @@ -355,11 +376,11 @@ static inline struct sock *unix_find_socket_byname(struct net *net, { struct sock *s; - spin_lock(&unix_table_locks[hash]); + spin_lock(&net->unx.table.locks[hash]); s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); - spin_unlock(&unix_table_locks[hash]); + spin_unlock(&net->unx.table.locks[hash]); return s; } @@ -368,17 +389,17 @@ static struct sock *unix_find_socket_byinode(struct inode *i) unsigned int hash = unix_bsd_hash(i); struct sock *s; - spin_lock(&unix_table_locks[hash]); - sk_for_each(s, &unix_socket_table[hash]) { + spin_lock(&bsd_socket_locks[hash]); + sk_for_each_bound(s, &bsd_socket_buckets[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); - spin_unlock(&unix_table_locks[hash]); + spin_unlock(&bsd_socket_locks[hash]); return s; } } - spin_unlock(&unix_table_locks[hash]); + spin_unlock(&bsd_socket_locks[hash]); return NULL; } @@ -576,12 +597,13 @@ static void unix_sock_destructor(struct sock *sk) static void unix_release_sock(struct sock *sk, int embrion) { struct unix_sock *u = unix_sk(sk); - struct path path; struct sock *skpair; struct sk_buff *skb; + struct path path; int state; - unix_remove_socket(sk); + unix_remove_socket(sock_net(sk), sk); + unix_remove_bsd_socket(sk); /* Clear state */ unix_state_lock(sk); @@ -928,9 +950,9 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); - unix_insert_unbound_socket(sk); + unix_insert_unbound_socket(net, sk); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + sock_prot_inuse_add(net, sk->sk_prot, 1); return sk; @@ -991,8 +1013,8 @@ static int unix_release(struct socket *sock) return 0; } -static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, - int addr_len, int type) +static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, + int type) { struct inode *inode; struct path path; @@ -1061,7 +1083,7 @@ static struct sock *unix_find_other(struct net *net, struct sock *sk; if (sunaddr->sun_path[0]) - sk = unix_find_bsd(net, sunaddr, addr_len, type); + sk = unix_find_bsd(sunaddr, addr_len, type); else sk = unix_find_abstract(net, sunaddr, addr_len, type); @@ -1072,6 +1094,7 @@ static int unix_autobind(struct sock *sk) { unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + struct net *net = sock_net(sk); struct unix_address *addr; u32 lastnum, ordernum; int err; @@ -1100,11 +1123,10 @@ retry: sprintf(addr->name->sun_path + 1, "%05x", ordernum); new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); - unix_table_double_lock(old_hash, new_hash); + unix_table_double_lock(net, old_hash, new_hash); - if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, - new_hash)) { - unix_table_double_unlock(old_hash, new_hash); + if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { + unix_table_double_unlock(net, old_hash, new_hash); /* __unix_find_socket_byname() may take long time if many names * are already in use. @@ -1121,8 +1143,8 @@ retry: goto retry; } - __unix_set_addr_hash(sk, addr, new_hash); - unix_table_double_unlock(old_hash, new_hash); + __unix_set_addr_hash(net, sk, addr, new_hash); + unix_table_double_unlock(net, old_hash, new_hash); err = 0; out: mutex_unlock(&u->bindlock); @@ -1136,6 +1158,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + struct net *net = sock_net(sk); struct user_namespace *ns; // barf... struct unix_address *addr; struct dentry *dentry; @@ -1176,11 +1199,12 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, goto out_unlock; new_hash = unix_bsd_hash(d_backing_inode(dentry)); - unix_table_double_lock(old_hash, new_hash); + unix_table_double_lock(net, old_hash, new_hash); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); - __unix_set_addr_hash(sk, addr, new_hash); - unix_table_double_unlock(old_hash, new_hash); + __unix_set_addr_hash(net, sk, addr, new_hash); + unix_table_double_unlock(net, old_hash, new_hash); + unix_insert_bsd_socket(sk); mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); return 0; @@ -1203,6 +1227,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, { unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + struct net *net = sock_net(sk); struct unix_address *addr; int err; @@ -1220,19 +1245,18 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, } new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); - unix_table_double_lock(old_hash, new_hash); + unix_table_double_lock(net, old_hash, new_hash); - if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, - new_hash)) + if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) goto out_spin; - __unix_set_addr_hash(sk, addr, new_hash); - unix_table_double_unlock(old_hash, new_hash); + __unix_set_addr_hash(net, sk, addr, new_hash); + unix_table_double_unlock(net, old_hash, new_hash); mutex_unlock(&u->bindlock); return 0; out_spin: - unix_table_double_unlock(old_hash, new_hash); + unix_table_double_unlock(net, old_hash, new_hash); err = -EADDRINUSE; out_mutex: mutex_unlock(&u->bindlock); @@ -1291,9 +1315,8 @@ static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; + struct sock *sk = sock->sk; struct sock *other; int err; @@ -1314,7 +1337,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, } restart: - other = unix_find_other(net, sunaddr, alen, sock->type); + other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); if (IS_ERR(other)) { err = PTR_ERR(other); goto out; @@ -1402,15 +1425,13 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); + struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; struct unix_sock *u = unix_sk(sk), *newu, *otheru; - struct sock *newsk = NULL; - struct sock *other = NULL; + struct net *net = sock_net(sk); struct sk_buff *skb = NULL; - int st; - int err; long timeo; + int err; + int st; err = unix_validate_addr(sunaddr, addr_len); if (err) @@ -1430,7 +1451,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, */ /* create new sock for complete connection */ - newsk = unix_create1(sock_net(sk), NULL, 0, sock->type); + newsk = unix_create1(net, NULL, 0, sock->type); if (IS_ERR(newsk)) { err = PTR_ERR(newsk); newsk = NULL; @@ -1539,9 +1560,9 @@ restart: * * The contents of *(otheru->addr) and otheru->path * are seen fully set up here, since we have found - * otheru in hash under unix_table_locks. Insertion - * into the hash chain we'd found it in had been done - * in an earlier critical area protected by unix_table_locks, + * otheru in hash under its lock. Insertion into the + * hash chain we'd found it in had been done in an + * earlier critical area protected by the chain's lock, * the same one where we'd set *(otheru->addr) contents, * as well as otheru->path and otheru->addr itself. * @@ -1838,17 +1859,15 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb) static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); - struct unix_sock *u = unix_sk(sk); DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); - struct sock *other = NULL; - int err; - struct sk_buff *skb; - long timeo; + struct sock *sk = sock->sk, *other = NULL; + struct unix_sock *u = unix_sk(sk); struct scm_cookie scm; + struct sk_buff *skb; int data_len = 0; int sk_locked; + long timeo; + int err; wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); @@ -1915,7 +1934,7 @@ restart: if (sunaddr == NULL) goto out_free; - other = unix_find_other(net, sunaddr, msg->msg_namelen, + other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen, sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); @@ -3221,12 +3240,11 @@ static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) { unsigned long offset = get_offset(*pos); unsigned long bucket = get_bucket(*pos); - struct sock *sk; unsigned long count = 0; + struct sock *sk; - for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) { - if (sock_net(sk) != seq_file_net(seq)) - continue; + for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); + sk; sk = sk_next(sk)) { if (++count == offset) break; } @@ -3237,16 +3255,17 @@ static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) { unsigned long bucket = get_bucket(*pos); + struct net *net = seq_file_net(seq); struct sock *sk; - while (bucket < ARRAY_SIZE(unix_socket_table)) { - spin_lock(&unix_table_locks[bucket]); + while (bucket < UNIX_HASH_SIZE) { + spin_lock(&net->unx.table.locks[bucket]); sk = unix_from_bucket(seq, pos); if (sk) return sk; - spin_unlock(&unix_table_locks[bucket]); + spin_unlock(&net->unx.table.locks[bucket]); *pos = set_bucket_offset(++bucket, 1); } @@ -3259,11 +3278,12 @@ static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, { unsigned long bucket = get_bucket(*pos); - for (sk = sk_next(sk); sk; sk = sk_next(sk)) - if (sock_net(sk) == seq_file_net(seq)) - return sk; + sk = sk_next(sk); + if (sk) + return sk; + - spin_unlock(&unix_table_locks[bucket]); + spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); *pos = set_bucket_offset(++bucket, 1); @@ -3293,7 +3313,7 @@ static void unix_seq_stop(struct seq_file *seq, void *v) struct sock *sk = v; if (sk) - spin_unlock(&unix_table_locks[sk->sk_hash]); + spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); } static int unix_seq_show(struct seq_file *seq, void *v) @@ -3318,7 +3338,7 @@ static int unix_seq_show(struct seq_file *seq, void *v) (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), sock_i_ino(s)); - if (u->addr) { // under unix_table_locks here + if (u->addr) { // under a hash table lock here int i, len; seq_putc(seq, ' '); @@ -3388,9 +3408,6 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) iter->batch[iter->end_sk++] = start_sk; for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { - if (sock_net(sk) != seq_file_net(seq)) - continue; - if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++] = sk; @@ -3399,7 +3416,7 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) expected++; } - spin_unlock(&unix_table_locks[start_sk->sk_hash]); + spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); return expected; } @@ -3559,7 +3576,7 @@ static const struct net_proto_family unix_family_ops = { static int __net_init unix_net_init(struct net *net) { - int error = -ENOMEM; + int i; net->unx.sysctl_max_dgram_qlen = 10; if (unix_sysctl_register(net)) @@ -3567,18 +3584,44 @@ static int __net_init unix_net_init(struct net *net) #ifdef CONFIG_PROC_FS if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, - sizeof(struct seq_net_private))) { - unix_sysctl_unregister(net); - goto out; + sizeof(struct seq_net_private))) + goto err_sysctl; +#endif + + net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, + sizeof(spinlock_t), GFP_KERNEL); + if (!net->unx.table.locks) + goto err_proc; + + net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, + sizeof(struct hlist_head), + GFP_KERNEL); + if (!net->unx.table.buckets) + goto free_locks; + + for (i = 0; i < UNIX_HASH_SIZE; i++) { + spin_lock_init(&net->unx.table.locks[i]); + INIT_HLIST_HEAD(&net->unx.table.buckets[i]); } + + return 0; + +free_locks: + kvfree(net->unx.table.locks); +err_proc: +#ifdef CONFIG_PROC_FS + remove_proc_entry("unix", net->proc_net); +err_sysctl: #endif - error = 0; + unix_sysctl_unregister(net); out: - return error; + return -ENOMEM; } static void __net_exit unix_net_exit(struct net *net) { + kvfree(net->unx.table.buckets); + kvfree(net->unx.table.locks); unix_sysctl_unregister(net); remove_proc_entry("unix", net->proc_net); } @@ -3666,8 +3709,10 @@ static int __init af_unix_init(void) BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); - for (i = 0; i < 2 * UNIX_HASH_SIZE; i++) - spin_lock_init(&unix_table_locks[i]); + for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { + spin_lock_init(&bsd_socket_locks[i]); + INIT_HLIST_HEAD(&bsd_socket_buckets[i]); + } rc = proto_register(&unix_dgram_proto, 1); if (rc != 0) { diff --git a/net/unix/diag.c b/net/unix/diag.c index bb0b5ea1655f..105f522a89fe 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -13,7 +13,7 @@ static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) { - /* might or might not have unix_table_locks */ + /* might or might not have a hash table lock */ struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); if (!addr) @@ -195,25 +195,21 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct unix_diag_r static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { - struct unix_diag_req *req; - int num, s_num, slot, s_slot; struct net *net = sock_net(skb->sk); + int num, s_num, slot, s_slot; + struct unix_diag_req *req; req = nlmsg_data(cb->nlh); s_slot = cb->args[0]; num = s_num = cb->args[1]; - for (slot = s_slot; - slot < ARRAY_SIZE(unix_socket_table); - s_num = 0, slot++) { + for (slot = s_slot; slot < UNIX_HASH_SIZE; s_num = 0, slot++) { struct sock *sk; num = 0; - spin_lock(&unix_table_locks[slot]); - sk_for_each(sk, &unix_socket_table[slot]) { - if (!net_eq(sock_net(sk), net)) - continue; + spin_lock(&net->unx.table.locks[slot]); + sk_for_each(sk, &net->unx.table.buckets[slot]) { if (num < s_num) goto next; if (!(req->udiag_states & (1 << sk->sk_state))) @@ -222,13 +218,13 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) { - spin_unlock(&unix_table_locks[slot]); + spin_unlock(&net->unx.table.locks[slot]); goto done; } next: num++; } - spin_unlock(&unix_table_locks[slot]); + spin_unlock(&net->unx.table.locks[slot]); } done: cb->args[0] = slot; @@ -237,20 +233,21 @@ done: return skb->len; } -static struct sock *unix_lookup_by_ino(unsigned int ino) +static struct sock *unix_lookup_by_ino(struct net *net, unsigned int ino) { struct sock *sk; int i; - for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) { - spin_lock(&unix_table_locks[i]); - sk_for_each(sk, &unix_socket_table[i]) + for (i = 0; i < UNIX_HASH_SIZE; i++) { + spin_lock(&net->unx.table.locks[i]); + sk_for_each(sk, &net->unx.table.buckets[i]) { if (ino == sock_i_ino(sk)) { sock_hold(sk); - spin_unlock(&unix_table_locks[i]); + spin_unlock(&net->unx.table.locks[i]); return sk; } - spin_unlock(&unix_table_locks[i]); + } + spin_unlock(&net->unx.table.locks[i]); } return NULL; } @@ -259,21 +256,20 @@ static int unix_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh, struct unix_diag_req *req) { - int err = -EINVAL; - struct sock *sk; - struct sk_buff *rep; - unsigned int extra_len; struct net *net = sock_net(in_skb->sk); + unsigned int extra_len; + struct sk_buff *rep; + struct sock *sk; + int err; + err = -EINVAL; if (req->udiag_ino == 0) goto out_nosk; - sk = unix_lookup_by_ino(req->udiag_ino); + sk = unix_lookup_by_ino(net, req->udiag_ino); err = -ENOENT; if (sk == NULL) goto out_nosk; - if (!net_eq(sock_net(sk), net)) - goto out; err = sock_diag_check_cookie(sk, req->udiag_cookie); if (err) @@ -308,7 +304,6 @@ out_nosk: static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct unix_diag_req); - struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; @@ -317,7 +312,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) struct netlink_dump_control c = { .dump = unix_diag_dump, }; - return netlink_dump_start(net->diag_nlsk, skb, h, &c); + return netlink_dump_start(sock_net(skb->sk)->diag_nlsk, skb, h, &c); } else return unix_diag_get_exact(skb, h, nlmsg_data(h)); } diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index 01d44e2598e2..500129aa710c 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -26,11 +26,16 @@ int __net_init unix_sysctl_register(struct net *net) { struct ctl_table *table; - table = kmemdup(unix_table, sizeof(unix_table), GFP_KERNEL); - if (table == NULL) - goto err_alloc; + if (net_eq(net, &init_net)) { + table = unix_table; + } else { + table = kmemdup(unix_table, sizeof(unix_table), GFP_KERNEL); + if (!table) + goto err_alloc; + + table[0].data = &net->unx.sysctl_max_dgram_qlen; + } - table[0].data = &net->unx.sysctl_max_dgram_qlen; net->unx.ctl = register_net_sysctl(net, "net/unix", table); if (net->unx.ctl == NULL) goto err_reg; @@ -38,7 +43,8 @@ int __net_init unix_sysctl_register(struct net *net) return 0; err_reg: - kfree(table); + if (!net_eq(net, &init_net)) + kfree(table); err_alloc: return -ENOMEM; } @@ -49,5 +55,6 @@ void unix_sysctl_unregister(struct net *net) table = net->unx.ctl->ctl_table_arg; unregister_net_sysctl_table(net->unx.ctl); - kfree(table); + if (!net_eq(net, &init_net)) + kfree(table); } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 19ac872a6624..09002387987e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -538,12 +538,6 @@ static int xsk_generic_xmit(struct sock *sk) goto out; } - skb = xsk_build_skb(xs, &desc); - if (IS_ERR(skb)) { - err = PTR_ERR(skb); - goto out; - } - /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement @@ -552,11 +546,19 @@ static int xsk_generic_xmit(struct sock *sk) spin_lock_irqsave(&xs->pool->cq_lock, flags); if (xskq_prod_reserve(xs->pool->cq)) { spin_unlock_irqrestore(&xs->pool->cq_lock, flags); - kfree_skb(skb); goto out; } spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + skb = xsk_build_skb(xs, &desc); + if (IS_ERR(skb)) { + err = PTR_ERR(skb); + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_cancel(xs->pool->cq); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + goto out; + } + err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 87bdd71c7bb6..f70112176b7c 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -332,6 +332,7 @@ static void __xp_dma_unmap(struct xsk_dma_map *dma_map, unsigned long attrs) for (i = 0; i < dma_map->dma_pages_cnt; i++) { dma = &dma_map->dma_pages[i]; if (*dma) { + *dma &= ~XSK_NEXT_PG_CONTIG_MASK; dma_unmap_page_attrs(dma_map->dev, *dma, PAGE_SIZE, DMA_BIDIRECTIONAL, attrs); *dma = 0; |
