aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/bluetooth/hci_core.c3
-rw-r--r--net/bluetooth/hci_sync.c1
-rw-r--r--net/bridge/br_netfilter_hooks.c21
-rw-r--r--net/can/Kconfig5
-rw-r--r--net/can/bcm.c18
-rw-r--r--net/core/dev.c25
-rw-r--r--net/core/filter.c34
-rw-r--r--net/core/neighbour.c32
-rw-r--r--net/core/net-sysfs.c1
-rw-r--r--net/core/page_pool.c3
-rw-r--r--net/core/skbuff.c16
-rw-r--r--net/core/skmsg.c5
-rw-r--r--net/core/sock.c1
-rw-r--r--net/decnet/dn_neigh.c1
-rw-r--r--net/dsa/Kconfig11
-rw-r--r--net/dsa/Makefile1
-rw-r--r--net/dsa/slave.c29
-rw-r--r--net/dsa/tag_ksz.c59
-rw-r--r--net/dsa/tag_rzn1_a5psw.c113
-rw-r--r--net/ethtool/eeprom.c2
-rw-r--r--net/ipv4/arp.c1
-rw-r--r--net/ipv4/esp4.c4
-rw-r--r--net/ipv4/ip_gre.c15
-rw-r--r--net/ipv4/ip_output.c8
-rw-r--r--net/ipv4/ip_tunnel_core.c2
-rw-r--r--net/ipv4/ipconfig.c8
-rw-r--r--net/ipv4/ipmr.c215
-rw-r--r--net/ipv4/ipmr_base.c53
-rw-r--r--net/ipv4/ping.c10
-rw-r--r--net/ipv4/raw.c10
-rw-r--r--net/ipv4/raw_diag.c4
-rw-r--r--net/ipv4/tcp.c26
-rw-r--r--net/ipv4/tcp_bpf.c3
-rw-r--r--net/ipv4/tcp_ipv4.c6
-rw-r--r--net/ipv6/addrconf.c70
-rw-r--r--net/ipv6/ip6_gre.c15
-rw-r--r--net/ipv6/ip6mr.c202
-rw-r--r--net/ipv6/ndisc.c1
-rw-r--r--net/ipv6/raw.c3
-rw-r--r--net/ipv6/route.c11
-rw-r--r--net/ipv6/seg6_hmac.c1
-rw-r--r--net/ipv6/sit.c8
-rw-r--r--net/l2tp/l2tp_debugfs.c6
-rw-r--r--net/l2tp/l2tp_ppp.c2
-rw-r--r--net/mptcp/options.c10
-rw-r--r--net/mptcp/pm.c10
-rw-r--r--net/mptcp/pm_netlink.c48
-rw-r--r--net/mptcp/pm_userspace.c51
-rw-r--r--net/mptcp/protocol.c142
-rw-r--r--net/mptcp/protocol.h34
-rw-r--r--net/mptcp/subflow.c129
-rw-r--r--net/ncsi/ncsi-manage.c3
-rw-r--r--net/netfilter/nf_dup_netdev.c25
-rw-r--r--net/netfilter/nf_tables_api.c9
-rw-r--r--net/netfilter/nf_tables_core.c24
-rw-r--r--net/netfilter/nf_tables_trace.c44
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c2
-rw-r--r--net/netfilter/nft_meta.c13
-rw-r--r--net/netfilter/nft_numgen.c12
-rw-r--r--net/netfilter/nft_set_hash.c2
-rw-r--r--net/netfilter/nft_set_pipapo.c48
-rw-r--r--net/openvswitch/flow.c2
-rw-r--r--net/rose/rose_route.c4
-rw-r--r--net/rose/rose_timer.c34
-rw-r--r--net/rxrpc/rxkad.c2
-rw-r--r--net/sched/act_api.c22
-rw-r--r--net/sched/act_police.c2
-rw-r--r--net/sched/sch_netem.c4
-rw-r--r--net/sched/sch_taprio.c5
-rw-r--r--net/socket.c16
-rw-r--r--net/strparser/strparser.c3
-rw-r--r--net/sunrpc/clnt.c1
-rw-r--r--net/sunrpc/xdr.c2
-rw-r--r--net/tipc/core.c3
-rw-r--r--net/tipc/node.c41
-rw-r--r--net/tipc/socket.c1
-rw-r--r--net/tls/tls.h290
-rw-r--r--net/tls/tls_device.c3
-rw-r--r--net/tls/tls_device_fallback.c8
-rw-r--r--net/tls/tls_main.c100
-rw-r--r--net/tls/tls_proc.c3
-rw-r--r--net/tls/tls_sw.c242
-rw-r--r--net/tls/tls_toe.c2
-rw-r--r--net/unix/af_unix.c263
-rw-r--r--net/unix/diag.c49
-rw-r--r--net/unix/sysctl_net_unix.c19
-rw-r--r--net/xdp/xsk.c16
-rw-r--r--net/xdp/xsk_buff_pool.c1
88 files changed, 1919 insertions, 890 deletions
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 59a5c1341c26..a0f99baafd35 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -571,6 +571,7 @@ int hci_dev_close(__u16 dev)
goto done;
}
+ cancel_work_sync(&hdev->power_on);
if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF))
cancel_delayed_work(&hdev->power_off);
@@ -2675,6 +2676,8 @@ void hci_unregister_dev(struct hci_dev *hdev)
list_del(&hdev->list);
write_unlock(&hci_dev_list_lock);
+ cancel_work_sync(&hdev->power_on);
+
hci_cmd_sync_clear(hdev);
if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks))
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 286d6767f017..1739e8cb3291 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -4088,7 +4088,6 @@ int hci_dev_close_sync(struct hci_dev *hdev)
bt_dev_dbg(hdev, "");
- cancel_work_sync(&hdev->power_on);
cancel_delayed_work(&hdev->power_off);
cancel_delayed_work(&hdev->ncmd_timer);
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 4fd882686b04..ff4779036649 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -1012,9 +1012,24 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net,
return okfn(net, sk, skb);
ops = nf_hook_entries_get_hook_ops(e);
- for (i = 0; i < e->num_hook_entries &&
- ops[i]->priority <= NF_BR_PRI_BRNF; i++)
- ;
+ for (i = 0; i < e->num_hook_entries; i++) {
+ /* These hooks have already been called */
+ if (ops[i]->priority < NF_BR_PRI_BRNF)
+ continue;
+
+ /* These hooks have not been called yet, run them. */
+ if (ops[i]->priority > NF_BR_PRI_BRNF)
+ break;
+
+ /* take a closer look at NF_BR_PRI_BRNF. */
+ if (ops[i]->hook == br_nf_pre_routing) {
+ /* This hook diverted the skb to this function,
+ * hooks after this have not been run yet.
+ */
+ i++;
+ break;
+ }
+ }
nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev,
sk, net, okfn);
diff --git a/net/can/Kconfig b/net/can/Kconfig
index a9ac5ffab286..cb56be8e3862 100644
--- a/net/can/Kconfig
+++ b/net/can/Kconfig
@@ -15,7 +15,8 @@ menuconfig CAN
PF_CAN is contained in <Documentation/networking/can.rst>.
If you want CAN support you should say Y here and also to the
- specific driver for your controller(s) below.
+ specific driver for your controller(s) under the Network device
+ support section.
if CAN
@@ -69,6 +70,4 @@ config CAN_ISOTP
If you want to perform automotive vehicle diagnostic services (UDS),
say 'y'.
-source "drivers/net/can/Kconfig"
-
endif
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 65ee1b784a30..e60161bec850 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -100,6 +100,7 @@ static inline u64 get_u64(const struct canfd_frame *cp, int offset)
struct bcm_op {
struct list_head list;
+ struct rcu_head rcu;
int ifindex;
canid_t can_id;
u32 flags;
@@ -718,10 +719,9 @@ static struct bcm_op *bcm_find_op(struct list_head *ops,
return NULL;
}
-static void bcm_remove_op(struct bcm_op *op)
+static void bcm_free_op_rcu(struct rcu_head *rcu_head)
{
- hrtimer_cancel(&op->timer);
- hrtimer_cancel(&op->thrtimer);
+ struct bcm_op *op = container_of(rcu_head, struct bcm_op, rcu);
if ((op->frames) && (op->frames != &op->sframe))
kfree(op->frames);
@@ -732,6 +732,14 @@ static void bcm_remove_op(struct bcm_op *op)
kfree(op);
}
+static void bcm_remove_op(struct bcm_op *op)
+{
+ hrtimer_cancel(&op->timer);
+ hrtimer_cancel(&op->thrtimer);
+
+ call_rcu(&op->rcu, bcm_free_op_rcu);
+}
+
static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op)
{
if (op->rx_reg_dev == dev) {
@@ -757,6 +765,9 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh,
if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) &&
(op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) {
+ /* disable automatic timer on frame reception */
+ op->flags |= RX_NO_AUTOTIMER;
+
/*
* Don't care if we're bound or not (due to netdev
* problems) can_rx_unregister() is always a save
@@ -785,7 +796,6 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh,
bcm_rx_handler, op);
list_del(&op->list);
- synchronize_rcu();
bcm_remove_op(op);
return 1; /* done */
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 8958c4227b67..978ed0622d8f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -397,16 +397,18 @@ static void list_netdevice(struct net_device *dev)
/* Device list removal
* caller must respect a RCU grace period before freeing/reusing dev
*/
-static void unlist_netdevice(struct net_device *dev)
+static void unlist_netdevice(struct net_device *dev, bool lock)
{
ASSERT_RTNL();
/* Unlink dev from the device chain */
- write_lock(&dev_base_lock);
+ if (lock)
+ write_lock(&dev_base_lock);
list_del_rcu(&dev->dev_list);
netdev_name_node_del(dev->name_node);
hlist_del_rcu(&dev->index_hlist);
- write_unlock(&dev_base_lock);
+ if (lock)
+ write_unlock(&dev_base_lock);
dev_base_seq_inc(dev_net(dev));
}
@@ -10061,11 +10063,11 @@ int register_netdevice(struct net_device *dev)
goto err_uninit;
ret = netdev_register_kobject(dev);
- if (ret) {
- dev->reg_state = NETREG_UNREGISTERED;
+ write_lock(&dev_base_lock);
+ dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
+ write_unlock(&dev_base_lock);
+ if (ret)
goto err_uninit;
- }
- dev->reg_state = NETREG_REGISTERED;
__netdev_update_features(dev);
@@ -10347,7 +10349,9 @@ void netdev_run_todo(void)
continue;
}
+ write_lock(&dev_base_lock);
dev->reg_state = NETREG_UNREGISTERED;
+ write_unlock(&dev_base_lock);
linkwatch_forget_dev(dev);
}
@@ -10828,9 +10832,10 @@ void unregister_netdevice_many(struct list_head *head)
list_for_each_entry(dev, head, unreg_list) {
/* And unlink it from device chain. */
- unlist_netdevice(dev);
-
+ write_lock(&dev_base_lock);
+ unlist_netdevice(dev, false);
dev->reg_state = NETREG_UNREGISTERING;
+ write_unlock(&dev_base_lock);
}
flush_all_backlogs();
@@ -10977,7 +10982,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
dev_close(dev);
/* And unlink it from device chain */
- unlist_netdevice(dev);
+ unlist_netdevice(dev, true);
synchronize_net();
diff --git a/net/core/filter.c b/net/core/filter.c
index 4fae91984359..4ef77ec5255e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6559,10 +6559,21 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
ifindex, proto, netns_id, flags);
if (sk) {
- sk = sk_to_full_sk(sk);
- if (!sk_fullsock(sk)) {
+ struct sock *sk2 = sk_to_full_sk(sk);
+
+ /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
+ * sock refcnt is decremented to prevent a request_sock leak.
+ */
+ if (!sk_fullsock(sk2))
+ sk2 = NULL;
+ if (sk2 != sk) {
sock_gen_put(sk);
- return NULL;
+ /* Ensure there is no need to bump sk2 refcnt */
+ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
+ WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+ return NULL;
+ }
+ sk = sk2;
}
}
@@ -6596,10 +6607,21 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
flags);
if (sk) {
- sk = sk_to_full_sk(sk);
- if (!sk_fullsock(sk)) {
+ struct sock *sk2 = sk_to_full_sk(sk);
+
+ /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
+ * sock refcnt is decremented to prevent a request_sock leak.
+ */
+ if (!sk_fullsock(sk2))
+ sk2 = NULL;
+ if (sk2 != sk) {
sock_gen_put(sk);
- return NULL;
+ /* Ensure there is no need to bump sk2 refcnt */
+ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
+ WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+ return NULL;
+ }
+ sk = sk2;
}
}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index d8ec70622ecb..6a8c2596ebab 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1579,7 +1579,7 @@ static void neigh_managed_work(struct work_struct *work)
list_for_each_entry(neigh, &tbl->managed_list, managed_list)
neigh_event_send_probe(neigh, NULL, false);
queue_delayed_work(system_power_efficient_wq, &tbl->managed_work,
- max(NEIGH_VAR(&tbl->parms, DELAY_PROBE_TIME), HZ));
+ NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS));
write_unlock_bh(&tbl->lock);
}
@@ -2100,7 +2100,9 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
nla_put_msecs(skb, NDTPA_PROXY_DELAY,
NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_LOCKTIME,
- NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD))
+ NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_INTERVAL_PROBE_TIME_MS,
+ NEIGH_VAR(parms, INTERVAL_PROBE_TIME_MS), NDTPA_PAD))
goto nla_put_failure;
return nla_nest_end(skb, nest);
@@ -2255,6 +2257,7 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
[NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 },
[NDTPA_PROXY_DELAY] = { .type = NLA_U64 },
[NDTPA_LOCKTIME] = { .type = NLA_U64 },
+ [NDTPA_INTERVAL_PROBE_TIME_MS] = { .type = NLA_U64, .min = 1 },
};
static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2373,6 +2376,10 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
nla_get_msecs(tbp[i]));
call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
break;
+ case NDTPA_INTERVAL_PROBE_TIME_MS:
+ NEIGH_VAR_SET(p, INTERVAL_PROBE_TIME_MS,
+ nla_get_msecs(tbp[i]));
+ break;
case NDTPA_RETRANS_TIME:
NEIGH_VAR_SET(p, RETRANS_TIME,
nla_get_msecs(tbp[i]));
@@ -3562,6 +3569,22 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
return ret;
}
+static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tmp = *ctl;
+ int ret;
+
+ int min = msecs_to_jiffies(1);
+
+ tmp.extra1 = &min;
+ tmp.extra2 = NULL;
+
+ ret = proc_dointvec_ms_jiffies_minmax(&tmp, write, buffer, lenp, ppos);
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+
int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
@@ -3658,6 +3681,9 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \
NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies)
+#define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies_positive)
+
#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \
NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
@@ -3676,6 +3702,8 @@ static struct neigh_sysctl_table {
NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"),
NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"),
NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"),
+ NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(INTERVAL_PROBE_TIME_MS,
+ "interval_probe_time_ms"),
NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"),
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"),
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"),
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index d49fc974e630..d61afd21aab5 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -33,6 +33,7 @@ static const char fmt_dec[] = "%d\n";
static const char fmt_ulong[] = "%lu\n";
static const char fmt_u64[] = "%llu\n";
+/* Caller holds RTNL or dev_base_lock */
static inline int dev_isalive(const struct net_device *dev)
{
return dev->reg_state <= NETREG_REGISTERED;
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index f18e6e771993..b74905fcc3a1 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -389,7 +389,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
- nr_pages = alloc_pages_bulk_array(gfp, bulk, pool->alloc.cache);
+ nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
+ pool->alloc.cache);
if (unlikely(!nr_pages))
return NULL;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 00bf35ee8205..c4a751781581 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -454,8 +454,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
skb->fclone = SKB_FCLONE_ORIG;
refcount_set(&fclones->fclone_ref, 1);
-
- fclones->skb2.fclone = SKB_FCLONE_CLONE;
}
return skb;
@@ -1513,6 +1511,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
refcount_read(&fclones->fclone_ref) == 1) {
n = &fclones->skb2;
refcount_set(&fclones->fclone_ref, 2);
+ n->fclone = SKB_FCLONE_CLONE;
} else {
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
@@ -3195,9 +3194,7 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
}
}
- to->truesize += len + plen;
- to->len += len + plen;
- to->data_len += len + plen;
+ skb_len_add(to, len + plen);
if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
skb_tx_error(from);
@@ -3634,13 +3631,8 @@ onlymerged:
tgt->ip_summed = CHECKSUM_PARTIAL;
skb->ip_summed = CHECKSUM_PARTIAL;
- /* Yak, is it really working this way? Some helper please? */
- skb->len -= shiftlen;
- skb->data_len -= shiftlen;
- skb->truesize -= shiftlen;
- tgt->len += shiftlen;
- tgt->data_len += shiftlen;
- tgt->truesize += shiftlen;
+ skb_len_add(skb, -shiftlen);
+ skb_len_add(tgt, shiftlen);
return shiftlen;
}
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 4b297d67edb7..266d3b7b7d0b 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -702,6 +702,11 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
write_lock_bh(&sk->sk_callback_lock);
+ if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) {
+ psock = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
if (sk->sk_user_data) {
psock = ERR_PTR(-EBUSY);
goto out;
diff --git a/net/core/sock.c b/net/core/sock.c
index 92a0296ccb18..4cb957d934a2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2870,6 +2870,7 @@ void __sk_flush_backlog(struct sock *sk)
__release_sock(sk);
spin_unlock_bh(&sk->sk_lock.slock);
}
+EXPORT_SYMBOL_GPL(__sk_flush_backlog);
/**
* sk_wait_data - wait for data to arrive at sk_receive_queue
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index fbd98ac853ea..7c569bcc0aca 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -94,6 +94,7 @@ struct neigh_table dn_neigh_table = {
[NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
[NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+ [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
[NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
[NEIGH_VAR_PROXY_QLEN] = 0,
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 8cb87b5067ee..3eef72ce99a4 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -87,10 +87,10 @@ config NET_DSA_TAG_MTK
Mediatek switches.
config NET_DSA_TAG_KSZ
- tristate "Tag driver for Microchip 8795/9477/9893 families of switches"
+ tristate "Tag driver for Microchip 8795/937x/9477/9893 families of switches"
help
Say Y if you want to enable support for tagging frames for the
- Microchip 8795/9477/9893 families of switches.
+ Microchip 8795/937x/9477/9893 families of switches.
config NET_DSA_TAG_OCELOT
tristate "Tag driver for Ocelot family of switches, using NPI port"
@@ -132,6 +132,13 @@ config NET_DSA_TAG_RTL8_4
Say Y or M if you want to enable support for tagging frames for Realtek
switches with 8 byte protocol 4 tags, such as the Realtek RTL8365MB-VC.
+config NET_DSA_TAG_RZN1_A5PSW
+ tristate "Tag driver for Renesas RZ/N1 A5PSW switch"
+ help
+ Say Y or M if you want to enable support for tagging frames for
+ Renesas RZ/N1 embedded switch that uses an 8 byte tag located after
+ destination MAC address.
+
config NET_DSA_TAG_LAN9303
tristate "Tag driver for SMSC/Microchip LAN9303 family of switches"
help
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 9f75820e7c98..af28c24ead18 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_NET_DSA_TAG_OCELOT_8021Q) += tag_ocelot_8021q.o
obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o
obj-$(CONFIG_NET_DSA_TAG_RTL4_A) += tag_rtl4_a.o
obj-$(CONFIG_NET_DSA_TAG_RTL8_4) += tag_rtl8_4.o
+obj-$(CONFIG_NET_DSA_TAG_RZN1_A5PSW) += tag_rzn1_a5psw.o
obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o
obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
obj-$(CONFIG_NET_DSA_TAG_XRS700X) += tag_xrs700x.o
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 2e1ac638d135..ad6a6663feeb 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1002,6 +1002,18 @@ dsa_slave_get_eth_ctrl_stats(struct net_device *dev,
ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats);
}
+static void
+dsa_slave_get_rmon_stats(struct net_device *dev,
+ struct ethtool_rmon_stats *rmon_stats,
+ const struct ethtool_rmon_hist_range **ranges)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->get_rmon_stats)
+ ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges);
+}
+
static void dsa_slave_net_selftest(struct net_device *ndev,
struct ethtool_test *etest, u64 *buf)
{
@@ -1097,6 +1109,16 @@ static int dsa_slave_set_link_ksettings(struct net_device *dev,
return phylink_ethtool_ksettings_set(dp->pl, cmd);
}
+static void dsa_slave_get_pause_stats(struct net_device *dev,
+ struct ethtool_pause_stats *pause_stats)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->get_pause_stats)
+ ds->ops->get_pause_stats(ds, dp->index, pause_stats);
+}
+
static void dsa_slave_get_pauseparam(struct net_device *dev,
struct ethtool_pauseparam *pause)
{
@@ -2081,12 +2103,14 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
.get_eth_phy_stats = dsa_slave_get_eth_phy_stats,
.get_eth_mac_stats = dsa_slave_get_eth_mac_stats,
.get_eth_ctrl_stats = dsa_slave_get_eth_ctrl_stats,
+ .get_rmon_stats = dsa_slave_get_rmon_stats,
.set_wol = dsa_slave_set_wol,
.get_wol = dsa_slave_get_wol,
.set_eee = dsa_slave_set_eee,
.get_eee = dsa_slave_get_eee,
.get_link_ksettings = dsa_slave_get_link_ksettings,
.set_link_ksettings = dsa_slave_set_link_ksettings,
+ .get_pause_stats = dsa_slave_get_pause_stats,
.get_pauseparam = dsa_slave_get_pauseparam,
.set_pauseparam = dsa_slave_set_pauseparam,
.get_rxnfc = dsa_slave_get_rxnfc,
@@ -2460,8 +2484,9 @@ static int dsa_slave_changeupper(struct net_device *dev,
if (!err)
dsa_bridge_mtu_normalization(dp);
if (err == -EOPNOTSUPP) {
- NL_SET_ERR_MSG_MOD(extack,
- "Offloading not supported");
+ if (!extack->_msg)
+ NL_SET_ERR_MSG_MOD(extack,
+ "Offloading not supported");
err = 0;
}
err = notifier_from_errno(err);
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index 3509fc967ca9..38fa19c1e2d5 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -193,10 +193,69 @@ static const struct dsa_device_ops ksz9893_netdev_ops = {
DSA_TAG_DRIVER(ksz9893_netdev_ops);
MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893);
+/* For xmit, 2 bytes are added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * tag0 : represents tag override, lookup and valid
+ * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x80=port8)
+ *
+ * For rcv, 1 byte is added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * tag0 : zero-based value represents port
+ * (eg, 0x00=port1, 0x02=port3, 0x07=port8)
+ */
+#define LAN937X_EGRESS_TAG_LEN 2
+
+#define LAN937X_TAIL_TAG_BLOCKING_OVERRIDE BIT(11)
+#define LAN937X_TAIL_TAG_LOOKUP BIT(12)
+#define LAN937X_TAIL_TAG_VALID BIT(13)
+#define LAN937X_TAIL_TAG_PORT_MASK 7
+
+static struct sk_buff *lan937x_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ const struct ethhdr *hdr = eth_hdr(skb);
+ __be16 *tag;
+ u16 val;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+ return NULL;
+
+ tag = skb_put(skb, LAN937X_EGRESS_TAG_LEN);
+
+ val = BIT(dp->index);
+
+ if (is_link_local_ether_addr(hdr->h_dest))
+ val |= LAN937X_TAIL_TAG_BLOCKING_OVERRIDE;
+
+ /* Tail tag valid bit - This bit should always be set by the CPU */
+ val |= LAN937X_TAIL_TAG_VALID;
+
+ put_unaligned_be16(val, tag);
+
+ return skb;
+}
+
+static const struct dsa_device_ops lan937x_netdev_ops = {
+ .name = "lan937x",
+ .proto = DSA_TAG_PROTO_LAN937X,
+ .xmit = lan937x_xmit,
+ .rcv = ksz9477_rcv,
+ .needed_tailroom = LAN937X_EGRESS_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(lan937x_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN937X);
+
static struct dsa_tag_driver *dsa_tag_driver_array[] = {
&DSA_TAG_DRIVER_NAME(ksz8795_netdev_ops),
&DSA_TAG_DRIVER_NAME(ksz9477_netdev_ops),
&DSA_TAG_DRIVER_NAME(ksz9893_netdev_ops),
+ &DSA_TAG_DRIVER_NAME(lan937x_netdev_ops),
};
module_dsa_tag_drivers(dsa_tag_driver_array);
diff --git a/net/dsa/tag_rzn1_a5psw.c b/net/dsa/tag_rzn1_a5psw.c
new file mode 100644
index 000000000000..e2a5ee6ae688
--- /dev/null
+++ b/net/dsa/tag_rzn1_a5psw.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022 Schneider Electric
+ *
+ * Clément Léger <clement.leger@bootlin.com>
+ */
+
+#include <linux/bitfield.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <net/dsa.h>
+
+#include "dsa_priv.h"
+
+/* To define the outgoing port and to discover the incoming port a TAG is
+ * inserted after Src MAC :
+ *
+ * Dest MAC Src MAC TAG Type
+ * ...| 1 2 3 4 5 6 | 1 2 3 4 5 6 | 1 2 3 4 5 6 7 8 | 1 2 |...
+ * |<--------------->|
+ *
+ * See struct a5psw_tag for layout
+ */
+
+#define ETH_P_DSA_A5PSW 0xE001
+#define A5PSW_TAG_LEN 8
+#define A5PSW_CTRL_DATA_FORCE_FORWARD BIT(0)
+/* This is both used for xmit tag and rcv tagging */
+#define A5PSW_CTRL_DATA_PORT GENMASK(3, 0)
+
+struct a5psw_tag {
+ __be16 ctrl_tag;
+ __be16 ctrl_data;
+ __be16 ctrl_data2_hi;
+ __be16 ctrl_data2_lo;
+};
+
+static struct sk_buff *a5psw_tag_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct a5psw_tag *ptag;
+ u32 data2_val;
+
+ BUILD_BUG_ON(sizeof(*ptag) != A5PSW_TAG_LEN);
+
+ /* The Ethernet switch we are interfaced with needs packets to be at
+ * least 60 bytes otherwise they will be discarded when they enter the
+ * switch port logic.
+ */
+ if (__skb_put_padto(skb, ETH_ZLEN, false))
+ return NULL;
+
+ /* provide 'A5PSW_TAG_LEN' bytes additional space */
+ skb_push(skb, A5PSW_TAG_LEN);
+
+ /* make room between MACs and Ether-Type to insert tag */
+ dsa_alloc_etype_header(skb, A5PSW_TAG_LEN);
+
+ ptag = dsa_etype_header_pos_tx(skb);
+
+ data2_val = FIELD_PREP(A5PSW_CTRL_DATA_PORT, BIT(dp->index));
+ ptag->ctrl_tag = htons(ETH_P_DSA_A5PSW);
+ ptag->ctrl_data = htons(A5PSW_CTRL_DATA_FORCE_FORWARD);
+ ptag->ctrl_data2_lo = htons(data2_val);
+ ptag->ctrl_data2_hi = 0;
+
+ return skb;
+}
+
+static struct sk_buff *a5psw_tag_rcv(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct a5psw_tag *tag;
+ int port;
+
+ if (unlikely(!pskb_may_pull(skb, A5PSW_TAG_LEN))) {
+ dev_warn_ratelimited(&dev->dev,
+ "Dropping packet, cannot pull\n");
+ return NULL;
+ }
+
+ tag = dsa_etype_header_pos_rx(skb);
+
+ if (tag->ctrl_tag != htons(ETH_P_DSA_A5PSW)) {
+ dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid TAG marker\n");
+ return NULL;
+ }
+
+ port = FIELD_GET(A5PSW_CTRL_DATA_PORT, ntohs(tag->ctrl_data));
+
+ skb->dev = dsa_master_find_slave(dev, 0, port);
+ if (!skb->dev)
+ return NULL;
+
+ skb_pull_rcsum(skb, A5PSW_TAG_LEN);
+ dsa_strip_etype_header(skb, A5PSW_TAG_LEN);
+
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+static const struct dsa_device_ops a5psw_netdev_ops = {
+ .name = "a5psw",
+ .proto = DSA_TAG_PROTO_RZN1_A5PSW,
+ .xmit = a5psw_tag_xmit,
+ .rcv = a5psw_tag_rcv,
+ .needed_headroom = A5PSW_TAG_LEN,
+};
+
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_A5PSW);
+module_dsa_tag_driver(a5psw_netdev_ops);
diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c
index 7e6b37a54add..1c94bb8ea03f 100644
--- a/net/ethtool/eeprom.c
+++ b/net/ethtool/eeprom.c
@@ -36,7 +36,7 @@ static int fallback_set_params(struct eeprom_req_info *request,
if (request->page)
offset = request->page * ETH_MODULE_EEPROM_PAGE_LEN + offset;
- if (modinfo->type == ETH_MODULE_SFF_8079 &&
+ if (modinfo->type == ETH_MODULE_SFF_8472 &&
request->i2c_address == 0x51)
offset += ETH_MODULE_EEPROM_PAGE_LEN * 2;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index ab4a5601c82a..af2f12ffc9ca 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -168,6 +168,7 @@ struct neigh_table arp_tbl = {
[NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
[NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+ [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
[NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
[NEIGH_VAR_PROXY_QLEN] = 64,
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index b21238df3301..7eae8d686e20 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -502,9 +502,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
nfrags++;
- skb->len += tailen;
- skb->data_len += tailen;
- skb->truesize += tailen;
+ skb_len_add(skb, tailen);
if (sk && sk_fullsock(sk))
refcount_add(tailen, &sk->sk_wmem_alloc);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 3b9cd487075a..5c58e21f724e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -524,7 +524,6 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
int tunnel_hlen;
int version;
int nhoff;
- int thoff;
tun_info = skb_tunnel_info(skb);
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
@@ -558,10 +557,16 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
(ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
truncate = true;
- thoff = skb_transport_header(skb) - skb_mac_header(skb);
- if (skb->protocol == htons(ETH_P_IPV6) &&
- (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
- truncate = true;
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ int thoff;
+
+ if (skb_transport_header_was_set(skb))
+ thoff = skb_transport_header(skb) - skb_mac_header(skb);
+ else
+ thoff = nhoff + sizeof(struct ipv6hdr);
+ if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)
+ truncate = true;
+ }
if (version == 1) {
erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 00b4bf26fd93..5e32a2f86fbd 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1214,9 +1214,7 @@ alloc_new_skb:
pfrag->offset += copy;
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
- skb->len += copy;
- skb->data_len += copy;
- skb->truesize += copy;
+ skb_len_add(skb, copy);
wmem_alloc_delta += copy;
} else {
err = skb_zerocopy_iter_dgram(skb, from, copy);
@@ -1443,9 +1441,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
skb->csum = csum_block_add(skb->csum, csum, skb->len);
}
- skb->len += len;
- skb->data_len += len;
- skb->truesize += len;
+ skb_len_add(skb, len);
refcount_add(len, &sk->sk_wmem_alloc);
offset += len;
size -= len;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 6b2dc7b2b612..cc1caab4a654 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -410,7 +410,7 @@ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
u32 mtu = dst_mtu(encap_dst) - headroom;
if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) ||
- (!skb_is_gso(skb) && (skb->len - skb_mac_header_len(skb)) <= mtu))
+ (!skb_is_gso(skb) && (skb->len - skb_network_offset(skb)) <= mtu))
return 0;
skb_dst_update_pmtu_no_confirm(skb, mtu);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 9d41d5d5cd1e..f53a0f2453af 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1759,15 +1759,15 @@ static int __init ip_auto_config_setup(char *addrs)
case 4:
if ((dp = strchr(ip, '.'))) {
*dp++ = '\0';
- strlcpy(utsname()->domainname, dp,
+ strscpy(utsname()->domainname, dp,
sizeof(utsname()->domainname));
}
- strlcpy(utsname()->nodename, ip,
+ strscpy(utsname()->nodename, ip,
sizeof(utsname()->nodename));
ic_host_name_set = 1;
break;
case 5:
- strlcpy(user_dev_name, ip, sizeof(user_dev_name));
+ strscpy(user_dev_name, ip, sizeof(user_dev_name));
break;
case 6:
if (ic_proto_name(ip) == 0 &&
@@ -1814,7 +1814,7 @@ __setup("nfsaddrs=", nfsaddrs_config_setup);
static int __init vendor_class_identifier_setup(char *addrs)
{
- if (strlcpy(vendor_class_identifier, addrs,
+ if (strscpy(vendor_class_identifier, addrs,
sizeof(vendor_class_identifier))
>= sizeof(vendor_class_identifier))
pr_warn("DHCP: vendorclass too long, truncated to \"%s\"\n",
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 8324e541d193..73651d17e51f 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -77,7 +77,12 @@ struct ipmr_result {
* Note that the changes are semaphored via rtnl_lock.
*/
-static DEFINE_RWLOCK(mrt_lock);
+static DEFINE_SPINLOCK(mrt_lock);
+
+static struct net_device *vif_dev_read(const struct vif_device *vif)
+{
+ return rcu_dereference(vif->dev);
+}
/* Multicast router control variables */
@@ -100,11 +105,11 @@ static void ipmr_free_table(struct mr_table *mrt);
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
struct net_device *dev, struct sk_buff *skb,
struct mfc_cache *cache, int local);
-static int ipmr_cache_report(struct mr_table *mrt,
+static int ipmr_cache_report(const struct mr_table *mrt,
struct sk_buff *pkt, vifi_t vifi, int assert);
static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
int cmd);
-static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
+static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt);
static void mroute_clean_tables(struct mr_table *mrt, int flags);
static void ipmr_expire_process(struct timer_list *t);
@@ -501,11 +506,15 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
return err;
}
- read_lock(&mrt_lock);
dev->stats.tx_bytes += skb->len;
dev->stats.tx_packets++;
- ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
- read_unlock(&mrt_lock);
+ rcu_read_lock();
+
+ /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
+ ipmr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
+ IGMPMSG_WHOLEPKT);
+
+ rcu_read_unlock();
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -572,6 +581,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
{
struct net_device *reg_dev = NULL;
struct iphdr *encap;
+ int vif_num;
encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
/* Check that:
@@ -584,11 +594,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
ntohs(encap->tot_len) + pimlen > skb->len)
return 1;
- read_lock(&mrt_lock);
- if (mrt->mroute_reg_vif_num >= 0)
- reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
- read_unlock(&mrt_lock);
-
+ /* Pairs with WRITE_ONCE() in vif_add()/vid_delete() */
+ vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
+ if (vif_num >= 0)
+ reg_dev = vif_dev_read(&mrt->vif_table[vif_num]);
if (!reg_dev)
return 1;
@@ -614,10 +623,11 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
static int call_ipmr_vif_entry_notifiers(struct net *net,
enum fib_event_type event_type,
struct vif_device *vif,
+ struct net_device *vif_dev,
vifi_t vif_index, u32 tb_id)
{
return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type,
- vif, vif_index, tb_id,
+ vif, vif_dev, vif_index, tb_id,
&net->ipv4.ipmr_seq);
}
@@ -649,22 +659,19 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
v = &mrt->vif_table[vifi];
- if (VIF_EXISTS(mrt, vifi))
- call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi,
- mrt->id);
-
- write_lock_bh(&mrt_lock);
- dev = v->dev;
- v->dev = NULL;
-
- if (!dev) {
- write_unlock_bh(&mrt_lock);
+ dev = rtnl_dereference(v->dev);
+ if (!dev)
return -EADDRNOTAVAIL;
- }
- if (vifi == mrt->mroute_reg_vif_num)
- mrt->mroute_reg_vif_num = -1;
+ spin_lock(&mrt_lock);
+ call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, dev,
+ vifi, mrt->id);
+ RCU_INIT_POINTER(v->dev, NULL);
+ if (vifi == mrt->mroute_reg_vif_num) {
+ /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */
+ WRITE_ONCE(mrt->mroute_reg_vif_num, -1);
+ }
if (vifi + 1 == mrt->maxvif) {
int tmp;
@@ -672,10 +679,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
if (VIF_EXISTS(mrt, tmp))
break;
}
- mrt->maxvif = tmp+1;
+ WRITE_ONCE(mrt->maxvif, tmp + 1);
}
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
dev_set_allmulti(dev, -1);
@@ -777,7 +784,7 @@ out:
spin_unlock(&mfc_unres_lock);
}
-/* Fill oifs list. It is called under write locked mrt_lock. */
+/* Fill oifs list. It is called under locked mrt_lock. */
static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache,
unsigned char *ttls)
{
@@ -889,15 +896,18 @@ static int vif_add(struct net *net, struct mr_table *mrt,
v->remote = vifc->vifc_rmt_addr.s_addr;
/* And finish update writing critical data */
- write_lock_bh(&mrt_lock);
- v->dev = dev;
+ spin_lock(&mrt_lock);
+ rcu_assign_pointer(v->dev, dev);
netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC);
- if (v->flags & VIFF_REGISTER)
- mrt->mroute_reg_vif_num = vifi;
+ if (v->flags & VIFF_REGISTER) {
+ /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */
+ WRITE_ONCE(mrt->mroute_reg_vif_num, vifi);
+ }
if (vifi+1 > mrt->maxvif)
- mrt->maxvif = vifi+1;
- write_unlock_bh(&mrt_lock);
- call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id);
+ WRITE_ONCE(mrt->maxvif, vifi + 1);
+ spin_unlock(&mrt_lock);
+ call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev,
+ vifi, mrt->id);
return 0;
}
@@ -1001,9 +1011,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
/* Bounce a cache query up to mrouted and netlink.
*
- * Called under mrt_lock.
+ * Called under rcu_read_lock().
*/
-static int ipmr_cache_report(struct mr_table *mrt,
+static int ipmr_cache_report(const struct mr_table *mrt,
struct sk_buff *pkt, vifi_t vifi, int assert)
{
const int ihl = ip_hdrlen(pkt);
@@ -1038,8 +1048,11 @@ static int ipmr_cache_report(struct mr_table *mrt,
msg->im_vif = vifi;
msg->im_vif_hi = vifi >> 8;
} else {
- msg->im_vif = mrt->mroute_reg_vif_num;
- msg->im_vif_hi = mrt->mroute_reg_vif_num >> 8;
+ /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
+ int vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
+
+ msg->im_vif = vif_num;
+ msg->im_vif_hi = vif_num >> 8;
}
ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
@@ -1064,10 +1077,8 @@ static int ipmr_cache_report(struct mr_table *mrt,
skb->transport_header = skb->network_header;
}
- rcu_read_lock();
mroute_sk = rcu_dereference(mrt->mroute_sk);
if (!mroute_sk) {
- rcu_read_unlock();
kfree_skb(skb);
return -EINVAL;
}
@@ -1076,7 +1087,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
/* Deliver to mrouted */
ret = sock_queue_rcv_skb(mroute_sk, skb);
- rcu_read_unlock();
+
if (ret < 0) {
net_warn_ratelimited("mroute: pending queue full, dropping entries\n");
kfree_skb(skb);
@@ -1086,6 +1097,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
}
/* Queue a packet for resolution. It gets locked cache entry! */
+/* Called under rcu_read_lock() */
static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
struct sk_buff *skb, struct net_device *dev)
{
@@ -1198,12 +1210,12 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
mfc->mfcc_mcastgrp.s_addr, parent);
rcu_read_unlock();
if (c) {
- write_lock_bh(&mrt_lock);
+ spin_lock(&mrt_lock);
c->_c.mfc_parent = mfc->mfcc_parent;
ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls);
if (!mrtsock)
c->_c.mfc_flags |= MFC_STATIC;
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c,
mrt->id);
mroute_netlink_event(mrt, c, RTM_NEWROUTE);
@@ -1598,20 +1610,20 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
if (vr.vifi >= mrt->maxvif)
return -EINVAL;
vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif);
- read_lock(&mrt_lock);
+ rcu_read_lock();
vif = &mrt->vif_table[vr.vifi];
if (VIF_EXISTS(mrt, vr.vifi)) {
- vr.icount = vif->pkt_in;
- vr.ocount = vif->pkt_out;
- vr.ibytes = vif->bytes_in;
- vr.obytes = vif->bytes_out;
- read_unlock(&mrt_lock);
+ vr.icount = READ_ONCE(vif->pkt_in);
+ vr.ocount = READ_ONCE(vif->pkt_out);
+ vr.ibytes = READ_ONCE(vif->bytes_in);
+ vr.obytes = READ_ONCE(vif->bytes_out);
+ rcu_read_unlock();
if (copy_to_user(arg, &vr, sizeof(vr)))
return -EFAULT;
return 0;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EADDRNOTAVAIL;
case SIOCGETSGCNT:
if (copy_from_user(&sr, arg, sizeof(sr)))
@@ -1673,20 +1685,20 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
if (vr.vifi >= mrt->maxvif)
return -EINVAL;
vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif);
- read_lock(&mrt_lock);
+ rcu_read_lock();
vif = &mrt->vif_table[vr.vifi];
if (VIF_EXISTS(mrt, vr.vifi)) {
- vr.icount = vif->pkt_in;
- vr.ocount = vif->pkt_out;
- vr.ibytes = vif->bytes_in;
- vr.obytes = vif->bytes_out;
- read_unlock(&mrt_lock);
+ vr.icount = READ_ONCE(vif->pkt_in);
+ vr.ocount = READ_ONCE(vif->pkt_out);
+ vr.ibytes = READ_ONCE(vif->bytes_in);
+ vr.obytes = READ_ONCE(vif->bytes_out);
+ rcu_read_unlock();
if (copy_to_user(arg, &vr, sizeof(vr)))
return -EFAULT;
return 0;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EADDRNOTAVAIL;
case SIOCGETSGCNT:
if (copy_from_user(&sr, arg, sizeof(sr)))
@@ -1726,7 +1738,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
ipmr_for_each_table(mrt, net) {
v = &mrt->vif_table[0];
for (ct = 0; ct < mrt->maxvif; ct++, v++) {
- if (v->dev == dev)
+ if (rcu_access_pointer(v->dev) == dev)
vif_delete(mrt, ct, 1, NULL);
}
}
@@ -1804,26 +1816,28 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
}
#endif
-/* Processing handlers for ipmr_forward */
+/* Processing handlers for ipmr_forward, under rcu_read_lock() */
static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
int in_vifi, struct sk_buff *skb, int vifi)
{
const struct iphdr *iph = ip_hdr(skb);
struct vif_device *vif = &mrt->vif_table[vifi];
+ struct net_device *vif_dev;
struct net_device *dev;
struct rtable *rt;
struct flowi4 fl4;
int encap = 0;
- if (!vif->dev)
+ vif_dev = vif_dev_read(vif);
+ if (!vif_dev)
goto out_free;
if (vif->flags & VIFF_REGISTER) {
- vif->pkt_out++;
- vif->bytes_out += skb->len;
- vif->dev->stats.tx_bytes += skb->len;
- vif->dev->stats.tx_packets++;
+ WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+ WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
+ vif_dev->stats.tx_bytes += skb->len;
+ vif_dev->stats.tx_packets++;
ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
goto out_free;
}
@@ -1868,8 +1882,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
goto out_free;
}
- vif->pkt_out++;
- vif->bytes_out += skb->len;
+ WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+ WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
skb_dst_drop(skb);
skb_dst_set(skb, &rt->dst);
@@ -1881,8 +1895,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
if (vif->flags & VIFF_TUNNEL) {
ip_encap(net, skb, vif->local, vif->remote);
/* FIXME: extra output firewall step used to be here. --RR */
- vif->dev->stats.tx_packets++;
- vif->dev->stats.tx_bytes += skb->len;
+ vif_dev->stats.tx_packets++;
+ vif_dev->stats.tx_bytes += skb->len;
}
IPCB(skb)->flags |= IPSKB_FORWARDED;
@@ -1906,18 +1920,20 @@ out_free:
kfree_skb(skb);
}
-static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
+/* Called with mrt_lock or rcu_read_lock() */
+static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev)
{
int ct;
-
- for (ct = mrt->maxvif-1; ct >= 0; ct--) {
- if (mrt->vif_table[ct].dev == dev)
+ /* Pairs with WRITE_ONCE() in vif_delete()/vif_add() */
+ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) {
+ if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev)
break;
}
return ct;
}
/* "local" means that we should preserve one skb (for local delivery) */
+/* Called uner rcu_read_lock() */
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
struct net_device *dev, struct sk_buff *skb,
struct mfc_cache *c, int local)
@@ -1944,7 +1960,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
}
/* Wrong interface: drop packet and (maybe) send PIM assert. */
- if (mrt->vif_table[vif].dev != dev) {
+ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) {
if (rt_is_output_route(skb_rtable(skb))) {
/* It is our own packet, looped back.
* Very complicated situation...
@@ -1983,8 +1999,10 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
}
forward:
- mrt->vif_table[vif].pkt_in++;
- mrt->vif_table[vif].bytes_in += skb->len;
+ WRITE_ONCE(mrt->vif_table[vif].pkt_in,
+ mrt->vif_table[vif].pkt_in + 1);
+ WRITE_ONCE(mrt->vif_table[vif].bytes_in,
+ mrt->vif_table[vif].bytes_in + skb->len);
/* Forward the frame */
if (c->mfc_origin == htonl(INADDR_ANY) &&
@@ -2140,22 +2158,14 @@ int ip_mr_input(struct sk_buff *skb)
skb = skb2;
}
- read_lock(&mrt_lock);
vif = ipmr_find_vif(mrt, dev);
- if (vif >= 0) {
- int err2 = ipmr_cache_unresolved(mrt, vif, skb, dev);
- read_unlock(&mrt_lock);
-
- return err2;
- }
- read_unlock(&mrt_lock);
+ if (vif >= 0)
+ return ipmr_cache_unresolved(mrt, vif, skb, dev);
kfree_skb(skb);
return -ENODEV;
}
- read_lock(&mrt_lock);
ip_mr_forward(net, mrt, dev, skb, cache, local);
- read_unlock(&mrt_lock);
if (local)
return ip_local_deliver(skb);
@@ -2252,18 +2262,15 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
int vif = -1;
dev = skb->dev;
- read_lock(&mrt_lock);
if (dev)
vif = ipmr_find_vif(mrt, dev);
if (vif < 0) {
- read_unlock(&mrt_lock);
rcu_read_unlock();
return -ENODEV;
}
skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr));
if (!skb2) {
- read_unlock(&mrt_lock);
rcu_read_unlock();
return -ENOMEM;
}
@@ -2277,14 +2284,11 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
iph->daddr = daddr;
iph->version = 0;
err = ipmr_cache_unresolved(mrt, vif, skb2, dev);
- read_unlock(&mrt_lock);
rcu_read_unlock();
return err;
}
- read_lock(&mrt_lock);
err = mr_fill_mroute(mrt, skb, &cache->_c, rtm);
- read_unlock(&mrt_lock);
rcu_read_unlock();
return err;
}
@@ -2404,7 +2408,7 @@ static size_t igmpmsg_netlink_msgsize(size_t payloadlen)
return len;
}
-static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
+static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt)
{
struct net *net = read_pnet(&mrt->net);
struct nlmsghdr *nlh;
@@ -2744,18 +2748,21 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
{
+ struct net_device *vif_dev;
struct nlattr *vif_nest;
struct vif_device *vif;
+ vif = &mrt->vif_table[vifid];
+ vif_dev = rtnl_dereference(vif->dev);
/* if the VIF doesn't exist just continue */
- if (!VIF_EXISTS(mrt, vifid))
+ if (!vif_dev)
return true;
- vif = &mrt->vif_table[vifid];
vif_nest = nla_nest_start_noflag(skb, IPMRA_VIF);
if (!vif_nest)
return false;
- if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif->dev->ifindex) ||
+
+ if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif_dev->ifindex) ||
nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) ||
nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) ||
nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in,
@@ -2887,7 +2894,7 @@ out:
*/
static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(mrt_lock)
+ __acquires(RCU)
{
struct mr_vif_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
@@ -2899,14 +2906,14 @@ static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
iter->mrt = mrt;
- read_lock(&mrt_lock);
+ rcu_read_lock();
return mr_vif_seq_start(seq, pos);
}
static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
- __releases(mrt_lock)
+ __releases(RCU)
{
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
}
static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
@@ -2919,9 +2926,11 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
"Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
} else {
const struct vif_device *vif = v;
- const char *name = vif->dev ?
- vif->dev->name : "none";
+ const struct net_device *vif_dev;
+ const char *name;
+ vif_dev = vif_dev_read(vif);
+ name = vif_dev ? vif_dev->name : "none";
seq_printf(seq,
"%2td %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
vif - mrt->vif_table,
@@ -3017,7 +3026,7 @@ static int ipmr_dump(struct net *net, struct notifier_block *nb,
struct netlink_ext_ack *extack)
{
return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump,
- ipmr_mr_table_iter, &mrt_lock, extack);
+ ipmr_mr_table_iter, extack);
}
static const struct fib_notifier_ops ipmr_notifier_ops_template = {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index aa8738a91210..271dc03fc6db 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -13,7 +13,7 @@ void vif_device_init(struct vif_device *v,
unsigned short flags,
unsigned short get_iflink_mask)
{
- v->dev = NULL;
+ RCU_INIT_POINTER(v->dev, NULL);
v->bytes_in = 0;
v->bytes_out = 0;
v->pkt_in = 0;
@@ -208,6 +208,7 @@ EXPORT_SYMBOL(mr_mfc_seq_next);
int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
struct mr_mfc *c, struct rtmsg *rtm)
{
+ struct net_device *vif_dev;
struct rta_mfc_stats mfcs;
struct nlattr *mp_attr;
struct rtnexthop *nhp;
@@ -220,10 +221,13 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
return -ENOENT;
}
- if (VIF_EXISTS(mrt, c->mfc_parent) &&
- nla_put_u32(skb, RTA_IIF,
- mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
+ rcu_read_lock();
+ vif_dev = rcu_dereference(mrt->vif_table[c->mfc_parent].dev);
+ if (vif_dev && nla_put_u32(skb, RTA_IIF, vif_dev->ifindex) < 0) {
+ rcu_read_unlock();
return -EMSGSIZE;
+ }
+ rcu_read_unlock();
if (c->mfc_flags & MFC_OFFLOAD)
rtm->rtm_flags |= RTNH_F_OFFLOAD;
@@ -232,23 +236,27 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
if (!mp_attr)
return -EMSGSIZE;
+ rcu_read_lock();
for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
- if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
- struct vif_device *vif;
+ struct vif_device *vif = &mrt->vif_table[ct];
+
+ vif_dev = rcu_dereference(vif->dev);
+ if (vif_dev && c->mfc_un.res.ttls[ct] < 255) {
nhp = nla_reserve_nohdr(skb, sizeof(*nhp));
if (!nhp) {
+ rcu_read_unlock();
nla_nest_cancel(skb, mp_attr);
return -EMSGSIZE;
}
nhp->rtnh_flags = 0;
nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
- vif = &mrt->vif_table[ct];
- nhp->rtnh_ifindex = vif->dev->ifindex;
+ nhp->rtnh_ifindex = vif_dev->ifindex;
nhp->rtnh_len = sizeof(*nhp);
}
}
+ rcu_read_unlock();
nla_nest_end(skb, mp_attr);
@@ -275,13 +283,14 @@ static bool mr_mfc_uses_dev(const struct mr_table *mrt,
int ct;
for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
- if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
- const struct vif_device *vif;
-
- vif = &mrt->vif_table[ct];
- if (vif->dev == dev)
- return true;
- }
+ const struct net_device *vif_dev;
+ const struct vif_device *vif;
+
+ vif = &mrt->vif_table[ct];
+ vif_dev = rcu_access_pointer(vif->dev);
+ if (vif_dev && c->mfc_un.res.ttls[ct] < 255 &&
+ vif_dev == dev)
+ return true;
}
return false;
}
@@ -390,7 +399,6 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
struct netlink_ext_ack *extack),
struct mr_table *(*mr_iter)(struct net *net,
struct mr_table *mrt),
- rwlock_t *mrt_lock,
struct netlink_ext_ack *extack)
{
struct mr_table *mrt;
@@ -402,22 +410,25 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
for (mrt = mr_iter(net, NULL); mrt; mrt = mr_iter(net, mrt)) {
struct vif_device *v = &mrt->vif_table[0];
+ struct net_device *vif_dev;
struct mr_mfc *mfc;
int vifi;
/* Notifiy on table VIF entries */
- read_lock(mrt_lock);
+ rcu_read_lock();
for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
- if (!v->dev)
+ vif_dev = rcu_dereference(v->dev);
+ if (!vif_dev)
continue;
err = mr_call_vif_notifier(nb, family,
- FIB_EVENT_VIF_ADD,
- v, vifi, mrt->id, extack);
+ FIB_EVENT_VIF_ADD, v,
+ vif_dev, vifi,
+ mrt->id, extack);
if (err)
break;
}
- read_unlock(mrt_lock);
+ rcu_read_unlock();
if (err)
return err;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 3d6fc6d841b8..b83c2bd9d722 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -316,12 +316,16 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n",
sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));
+ if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
+ return 0;
+
tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
- if (!inet_addr_valid_or_nonlocal(net, inet_sk(sk),
- addr->sin_addr.s_addr,
- chk_addr_ret))
+ if (chk_addr_ret == RTN_MULTICAST ||
+ chk_addr_ret == RTN_BROADCAST ||
+ (chk_addr_ret != RTN_LOCAL &&
+ !inet_can_nonlocal_bind(net, isk)))
return -EADDRNOTAVAIL;
#if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 959bea12dc48..006c1f0ed8b4 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -95,10 +95,10 @@ int raw_hash_sk(struct sock *sk)
hlist = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
- write_lock_bh(&h->lock);
+ spin_lock(&h->lock);
__sk_nulls_add_node_rcu(sk, hlist);
sock_set_flag(sk, SOCK_RCU_FREE);
- write_unlock_bh(&h->lock);
+ spin_unlock(&h->lock);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
return 0;
@@ -109,10 +109,10 @@ void raw_unhash_sk(struct sock *sk)
{
struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
- write_lock_bh(&h->lock);
+ spin_lock(&h->lock);
if (__sk_nulls_del_node_init_rcu(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
- write_unlock_bh(&h->lock);
+ spin_unlock(&h->lock);
}
EXPORT_SYMBOL_GPL(raw_unhash_sk);
@@ -278,7 +278,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
sk_nulls_for_each(sk, hnode, hlist) {
iph = (const struct iphdr *)skb->data;
if (!raw_v4_match(net, sk, iph->protocol,
- iph->saddr, iph->daddr, dif, sdif))
+ iph->daddr, iph->saddr, dif, sdif))
continue;
raw_err(sk, skb, info);
}
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index ac4b6525d3c6..999321834b94 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -156,7 +156,7 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
s_slot = cb->args[0];
num = s_num = cb->args[1];
- read_lock(&hashinfo->lock);
+ rcu_read_lock();
for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) {
num = 0;
@@ -184,7 +184,7 @@ next:
}
out_unlock:
- read_unlock(&hashinfo->lock);
+ rcu_read_unlock();
cb->args[0] = slot;
cb->args[1] = num;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9d2fd3ced21b..21bdee88383b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4575,16 +4575,24 @@ EXPORT_SYMBOL_GPL(tcp_done);
int tcp_abort(struct sock *sk, int err)
{
- if (!sk_fullsock(sk)) {
- if (sk->sk_state == TCP_NEW_SYN_RECV) {
- struct request_sock *req = inet_reqsk(sk);
+ int state = inet_sk_state_load(sk);
- local_bh_disable();
- inet_csk_reqsk_queue_drop(req->rsk_listener, req);
- local_bh_enable();
- return 0;
- }
- return -EOPNOTSUPP;
+ if (state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+
+ local_bh_disable();
+ inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ local_bh_enable();
+ return 0;
+ }
+ if (state == TCP_TIME_WAIT) {
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+
+ refcount_inc(&tw->tw_refcnt);
+ local_bh_disable();
+ inet_twsk_deschedule_put(tw);
+ local_bh_enable();
+ return 0;
}
/* Don't race with userspace socket closes such as tcp_close. */
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 38550bb1b90b..a1626afe87a1 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -612,9 +612,6 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
return 0;
}
- if (inet_csk_has_ulp(sk))
- return -EINVAL;
-
if (sk->sk_family == AF_INET6) {
if (tcp_bpf_assert_proto_ops(psock->sk_proto))
return -EINVAL;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fda811a5251f..68d0d8a008e2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1964,7 +1964,10 @@ process:
struct sock *nsk;
sk = req->rsk_listener;
- drop_reason = tcp_inbound_md5_hash(sk, skb,
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
+ else
+ drop_reason = tcp_inbound_md5_hash(sk, skb,
&iph->saddr, &iph->daddr,
AF_INET, dif, sdif);
if (unlikely(drop_reason)) {
@@ -2016,6 +2019,7 @@ process:
}
goto discard_and_relse;
}
+ nf_reset_ct(skb);
if (nsk == sk) {
reqsk_put(req);
tcp_v4_restore_cb(skb);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 3497ad1362c0..88becb037eb6 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1109,10 +1109,6 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
goto out;
}
- if (net->ipv6.devconf_all->disable_policy ||
- idev->cnf.disable_policy)
- f6i->dst_nopolicy = true;
-
neigh_parms_data_state_setall(idev->nd_parms);
ifa->addr = *cfg->pfx;
@@ -4524,6 +4520,39 @@ restart:
/* We try to batch several events at once. */
age = (now - ifp->tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+ if ((ifp->flags&IFA_F_TEMPORARY) &&
+ !(ifp->flags&IFA_F_TENTATIVE) &&
+ ifp->prefered_lft != INFINITY_LIFE_TIME &&
+ !ifp->regen_count && ifp->ifpub) {
+ /* This is a non-regenerated temporary addr. */
+
+ unsigned long regen_advance = ifp->idev->cnf.regen_max_retry *
+ ifp->idev->cnf.dad_transmits *
+ max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), HZ/100) / HZ;
+
+ if (age + regen_advance >= ifp->prefered_lft) {
+ struct inet6_ifaddr *ifpub = ifp->ifpub;
+ if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
+ next = ifp->tstamp + ifp->prefered_lft * HZ;
+
+ ifp->regen_count++;
+ in6_ifa_hold(ifp);
+ in6_ifa_hold(ifpub);
+ spin_unlock(&ifp->lock);
+
+ spin_lock(&ifpub->lock);
+ ifpub->regen_count = 0;
+ spin_unlock(&ifpub->lock);
+ rcu_read_unlock_bh();
+ ipv6_create_tempaddr(ifpub, true);
+ in6_ifa_put(ifpub);
+ in6_ifa_put(ifp);
+ rcu_read_lock_bh();
+ goto restart;
+ } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
+ next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ;
+ }
+
if (ifp->valid_lft != INFINITY_LIFE_TIME &&
age >= ifp->valid_lft) {
spin_unlock(&ifp->lock);
@@ -4557,35 +4586,6 @@ restart:
in6_ifa_put(ifp);
goto restart;
}
- } else if ((ifp->flags&IFA_F_TEMPORARY) &&
- !(ifp->flags&IFA_F_TENTATIVE)) {
- unsigned long regen_advance = ifp->idev->cnf.regen_max_retry *
- ifp->idev->cnf.dad_transmits *
- max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), HZ/100) / HZ;
-
- if (age >= ifp->prefered_lft - regen_advance) {
- struct inet6_ifaddr *ifpub = ifp->ifpub;
- if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
- next = ifp->tstamp + ifp->prefered_lft * HZ;
- if (!ifp->regen_count && ifpub) {
- ifp->regen_count++;
- in6_ifa_hold(ifp);
- in6_ifa_hold(ifpub);
- spin_unlock(&ifp->lock);
-
- spin_lock(&ifpub->lock);
- ifpub->regen_count = 0;
- spin_unlock(&ifpub->lock);
- rcu_read_unlock_bh();
- ipv6_create_tempaddr(ifpub, true);
- in6_ifa_put(ifpub);
- in6_ifa_put(ifp);
- rcu_read_lock_bh();
- goto restart;
- }
- } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
- next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ;
- spin_unlock(&ifp->lock);
} else {
/* ifp->prefered_lft <= ifp->valid_lft */
if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
@@ -5172,9 +5172,9 @@ next:
fillargs->event = RTM_GETMULTICAST;
/* multicast address */
- for (ifmca = rcu_dereference(idev->mc_list);
+ for (ifmca = rtnl_dereference(idev->mc_list);
ifmca;
- ifmca = rcu_dereference(ifmca->next), ip_idx++) {
+ ifmca = rtnl_dereference(ifmca->next), ip_idx++) {
if (ip_idx < s_ip_idx)
continue;
err = inet6_fill_ifmcaddr(skb, ifmca, fillargs);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 3e22cbe5966a..1bd10ae332e8 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -939,7 +939,6 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
__be16 proto;
__u32 mtu;
int nhoff;
- int thoff;
if (!pskb_inet_may_pull(skb))
goto tx_err;
@@ -960,10 +959,16 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
(ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
truncate = true;
- thoff = skb_transport_header(skb) - skb_mac_header(skb);
- if (skb->protocol == htons(ETH_P_IPV6) &&
- (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
- truncate = true;
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ int thoff;
+
+ if (skb_transport_header_was_set(skb))
+ thoff = skb_transport_header(skb) - skb_mac_header(skb);
+ else
+ thoff = nhoff + sizeof(struct ipv6hdr);
+ if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)
+ truncate = true;
+ }
if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen))
goto tx_err;
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index d4aad41c9849..ec6e1509fc7c 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -62,7 +62,12 @@ struct ip6mr_result {
Note that the changes are semaphored via rtnl_lock.
*/
-static DEFINE_RWLOCK(mrt_lock);
+static DEFINE_SPINLOCK(mrt_lock);
+
+static struct net_device *vif_dev_read(const struct vif_device *vif)
+{
+ return rcu_dereference(vif->dev);
+}
/* Multicast router control variables */
@@ -85,11 +90,11 @@ static void ip6mr_free_table(struct mr_table *mrt);
static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
struct net_device *dev, struct sk_buff *skb,
struct mfc6_cache *cache);
-static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
+static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt,
mifi_t mifi, int assert);
static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
int cmd);
-static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
+static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt);
static int ip6mr_rtm_dumproute(struct sk_buff *skb,
struct netlink_callback *cb);
static void mroute_clean_tables(struct mr_table *mrt, int flags);
@@ -398,7 +403,7 @@ static void ip6mr_free_table(struct mr_table *mrt)
*/
static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(mrt_lock)
+ __acquires(RCU)
{
struct mr_vif_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
@@ -410,14 +415,14 @@ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
iter->mrt = mrt;
- read_lock(&mrt_lock);
+ rcu_read_lock();
return mr_vif_seq_start(seq, pos);
}
static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v)
- __releases(mrt_lock)
+ __releases(RCU)
{
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
}
static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
@@ -430,7 +435,11 @@ static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
"Interface BytesIn PktsIn BytesOut PktsOut Flags\n");
} else {
const struct vif_device *vif = v;
- const char *name = vif->dev ? vif->dev->name : "none";
+ const struct net_device *vif_dev;
+ const char *name;
+
+ vif_dev = vif_dev_read(vif);
+ name = vif_dev ? vif_dev->name : "none";
seq_printf(seq,
"%2td %-10s %8ld %7ld %8ld %7ld %05X\n",
@@ -549,13 +558,11 @@ static int pim6_rcv(struct sk_buff *skb)
if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
goto drop;
- reg_vif_num = mrt->mroute_reg_vif_num;
- read_lock(&mrt_lock);
+ /* Pairs with WRITE_ONCE() in mif6_add()/mif6_delete() */
+ reg_vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
if (reg_vif_num >= 0)
- reg_dev = mrt->vif_table[reg_vif_num].dev;
- dev_hold(reg_dev);
- read_unlock(&mrt_lock);
+ reg_dev = vif_dev_read(&mrt->vif_table[reg_vif_num]);
if (!reg_dev)
goto drop;
@@ -570,7 +577,6 @@ static int pim6_rcv(struct sk_buff *skb)
netif_rx(skb);
- dev_put(reg_dev);
return 0;
drop:
kfree_skb(skb);
@@ -600,11 +606,12 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
goto tx_err;
- read_lock(&mrt_lock);
dev->stats.tx_bytes += skb->len;
dev->stats.tx_packets++;
- ip6mr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, MRT6MSG_WHOLEPKT);
- read_unlock(&mrt_lock);
+ rcu_read_lock();
+ ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
+ MRT6MSG_WHOLEPKT);
+ rcu_read_unlock();
kfree_skb(skb);
return NETDEV_TX_OK;
@@ -670,10 +677,11 @@ failure:
static int call_ip6mr_vif_entry_notifiers(struct net *net,
enum fib_event_type event_type,
struct vif_device *vif,
+ struct net_device *vif_dev,
mifi_t vif_index, u32 tb_id)
{
return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type,
- vif, vif_index, tb_id,
+ vif, vif_dev, vif_index, tb_id,
&net->ipv6.ipmr_seq);
}
@@ -698,23 +706,21 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
v = &mrt->vif_table[vifi];
- if (VIF_EXISTS(mrt, vifi))
- call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net),
- FIB_EVENT_VIF_DEL, v, vifi,
- mrt->id);
-
- write_lock_bh(&mrt_lock);
- dev = v->dev;
- v->dev = NULL;
-
- if (!dev) {
- write_unlock_bh(&mrt_lock);
+ dev = rtnl_dereference(v->dev);
+ if (!dev)
return -EADDRNOTAVAIL;
- }
+
+ call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net),
+ FIB_EVENT_VIF_DEL, v, dev,
+ vifi, mrt->id);
+ spin_lock(&mrt_lock);
+ RCU_INIT_POINTER(v->dev, NULL);
#ifdef CONFIG_IPV6_PIMSM_V2
- if (vifi == mrt->mroute_reg_vif_num)
- mrt->mroute_reg_vif_num = -1;
+ if (vifi == mrt->mroute_reg_vif_num) {
+ /* Pairs with READ_ONCE() in ip6mr_cache_report() and reg_vif_xmit() */
+ WRITE_ONCE(mrt->mroute_reg_vif_num, -1);
+ }
#endif
if (vifi + 1 == mrt->maxvif) {
@@ -723,10 +729,10 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
if (VIF_EXISTS(mrt, tmp))
break;
}
- mrt->maxvif = tmp + 1;
+ WRITE_ONCE(mrt->maxvif, tmp + 1);
}
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
dev_set_allmulti(dev, -1);
@@ -826,7 +832,7 @@ static void ipmr_expire_process(struct timer_list *t)
spin_unlock(&mfc_unres_lock);
}
-/* Fill oifs list. It is called under write locked mrt_lock. */
+/* Fill oifs list. It is called under locked mrt_lock. */
static void ip6mr_update_thresholds(struct mr_table *mrt,
struct mr_mfc *cache,
@@ -912,18 +918,18 @@ static int mif6_add(struct net *net, struct mr_table *mrt,
MIFF_REGISTER);
/* And finish update writing critical data */
- write_lock_bh(&mrt_lock);
- v->dev = dev;
+ spin_lock(&mrt_lock);
+ rcu_assign_pointer(v->dev, dev);
netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC);
#ifdef CONFIG_IPV6_PIMSM_V2
if (v->flags & MIFF_REGISTER)
- mrt->mroute_reg_vif_num = vifi;
+ WRITE_ONCE(mrt->mroute_reg_vif_num, vifi);
#endif
if (vifi + 1 > mrt->maxvif)
- mrt->maxvif = vifi + 1;
- write_unlock_bh(&mrt_lock);
+ WRITE_ONCE(mrt->maxvif, vifi + 1);
+ spin_unlock(&mrt_lock);
call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD,
- v, vifi, mrt->id);
+ v, dev, vifi, mrt->id);
return 0;
}
@@ -1028,10 +1034,10 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt,
/*
* Bounce a cache query up to pim6sd and netlink.
*
- * Called under mrt_lock.
+ * Called under rcu_read_lock()
*/
-static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
+static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt,
mifi_t mifi, int assert)
{
struct sock *mroute6_sk;
@@ -1072,7 +1078,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
if (assert == MRT6MSG_WRMIFWHOLE)
msg->im6_mif = mifi;
else
- msg->im6_mif = mrt->mroute_reg_vif_num;
+ msg->im6_mif = READ_ONCE(mrt->mroute_reg_vif_num);
msg->im6_pad = 0;
msg->im6_src = ipv6_hdr(pkt)->saddr;
msg->im6_dst = ipv6_hdr(pkt)->daddr;
@@ -1107,10 +1113,8 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
- rcu_read_lock();
mroute6_sk = rcu_dereference(mrt->mroute_sk);
if (!mroute6_sk) {
- rcu_read_unlock();
kfree_skb(skb);
return -EINVAL;
}
@@ -1119,7 +1123,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
/* Deliver to user space multicast routing algorithms */
ret = sock_queue_rcv_skb(mroute6_sk, skb);
- rcu_read_unlock();
+
if (ret < 0) {
net_warn_ratelimited("mroute6: pending queue full, dropping entries\n");
kfree_skb(skb);
@@ -1243,7 +1247,7 @@ static int ip6mr_device_event(struct notifier_block *this,
ip6mr_for_each_table(mrt, net) {
v = &mrt->vif_table[0];
for (ct = 0; ct < mrt->maxvif; ct++, v++) {
- if (v->dev == dev)
+ if (rcu_access_pointer(v->dev) == dev)
mif6_delete(mrt, ct, 1, NULL);
}
}
@@ -1262,7 +1266,7 @@ static int ip6mr_dump(struct net *net, struct notifier_block *nb,
struct netlink_ext_ack *extack)
{
return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump,
- ip6mr_mr_table_iter, &mrt_lock, extack);
+ ip6mr_mr_table_iter, extack);
}
static struct notifier_block ip6_mr_notifier = {
@@ -1437,12 +1441,12 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
&mfc->mf6cc_mcastgrp.sin6_addr, parent);
rcu_read_unlock();
if (c) {
- write_lock_bh(&mrt_lock);
+ spin_lock(&mrt_lock);
c->_c.mfc_parent = mfc->mf6cc_parent;
ip6mr_update_thresholds(mrt, &c->_c, ttls);
if (!mrtsock)
c->_c.mfc_flags |= MFC_STATIC;
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
c, mrt->id);
mr6_netlink_event(mrt, c, RTM_NEWROUTE);
@@ -1560,7 +1564,7 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
struct net *net = sock_net(sk);
rtnl_lock();
- write_lock_bh(&mrt_lock);
+ spin_lock(&mrt_lock);
if (rtnl_dereference(mrt->mroute_sk)) {
err = -EADDRINUSE;
} else {
@@ -1568,7 +1572,7 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
sock_set_flag(sk, SOCK_RCU_FREE);
atomic_inc(&net->ipv6.devconf_all->mc_forwarding);
}
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
if (!err)
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
@@ -1598,14 +1602,14 @@ int ip6mr_sk_done(struct sock *sk)
rtnl_lock();
ip6mr_for_each_table(mrt, net) {
if (sk == rtnl_dereference(mrt->mroute_sk)) {
- write_lock_bh(&mrt_lock);
+ spin_lock(&mrt_lock);
RCU_INIT_POINTER(mrt->mroute_sk, NULL);
/* Note that mroute_sk had SOCK_RCU_FREE set,
* so the RCU grace period before sk freeing
* is guaranteed by sk_destruct()
*/
atomic_dec(&devconf->mc_forwarding);
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
NETCONFA_MC_FORWARDING,
NETCONFA_IFINDEX_ALL,
@@ -1891,20 +1895,20 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
if (vr.mifi >= mrt->maxvif)
return -EINVAL;
vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif);
- read_lock(&mrt_lock);
+ rcu_read_lock();
vif = &mrt->vif_table[vr.mifi];
if (VIF_EXISTS(mrt, vr.mifi)) {
- vr.icount = vif->pkt_in;
- vr.ocount = vif->pkt_out;
- vr.ibytes = vif->bytes_in;
- vr.obytes = vif->bytes_out;
- read_unlock(&mrt_lock);
+ vr.icount = READ_ONCE(vif->pkt_in);
+ vr.ocount = READ_ONCE(vif->pkt_out);
+ vr.ibytes = READ_ONCE(vif->bytes_in);
+ vr.obytes = READ_ONCE(vif->bytes_out);
+ rcu_read_unlock();
if (copy_to_user(arg, &vr, sizeof(vr)))
return -EFAULT;
return 0;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EADDRNOTAVAIL;
case SIOCGETSGCNT_IN6:
if (copy_from_user(&sr, arg, sizeof(sr)))
@@ -1966,20 +1970,20 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
if (vr.mifi >= mrt->maxvif)
return -EINVAL;
vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif);
- read_lock(&mrt_lock);
+ rcu_read_lock();
vif = &mrt->vif_table[vr.mifi];
if (VIF_EXISTS(mrt, vr.mifi)) {
- vr.icount = vif->pkt_in;
- vr.ocount = vif->pkt_out;
- vr.ibytes = vif->bytes_in;
- vr.obytes = vif->bytes_out;
- read_unlock(&mrt_lock);
+ vr.icount = READ_ONCE(vif->pkt_in);
+ vr.ocount = READ_ONCE(vif->pkt_out);
+ vr.ibytes = READ_ONCE(vif->bytes_in);
+ vr.obytes = READ_ONCE(vif->bytes_out);
+ rcu_read_unlock();
if (copy_to_user(arg, &vr, sizeof(vr)))
return -EFAULT;
return 0;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EADDRNOTAVAIL;
case SIOCGETSGCNT_IN6:
if (copy_from_user(&sr, arg, sizeof(sr)))
@@ -2021,21 +2025,22 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct
static int ip6mr_forward2(struct net *net, struct mr_table *mrt,
struct sk_buff *skb, int vifi)
{
- struct ipv6hdr *ipv6h;
struct vif_device *vif = &mrt->vif_table[vifi];
- struct net_device *dev;
+ struct net_device *vif_dev;
+ struct ipv6hdr *ipv6h;
struct dst_entry *dst;
struct flowi6 fl6;
- if (!vif->dev)
+ vif_dev = vif_dev_read(vif);
+ if (!vif_dev)
goto out_free;
#ifdef CONFIG_IPV6_PIMSM_V2
if (vif->flags & MIFF_REGISTER) {
- vif->pkt_out++;
- vif->bytes_out += skb->len;
- vif->dev->stats.tx_bytes += skb->len;
- vif->dev->stats.tx_packets++;
+ WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+ WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
+ vif_dev->stats.tx_bytes += skb->len;
+ vif_dev->stats.tx_packets++;
ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT);
goto out_free;
}
@@ -2068,14 +2073,13 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt,
* not mrouter) cannot join to more than one interface - it will
* result in receiving multiple packets.
*/
- dev = vif->dev;
- skb->dev = dev;
- vif->pkt_out++;
- vif->bytes_out += skb->len;
+ skb->dev = vif_dev;
+ WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+ WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
/* We are about to write */
/* XXX: extension headers? */
- if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(dev)))
+ if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev)))
goto out_free;
ipv6h = ipv6_hdr(skb);
@@ -2084,7 +2088,7 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt,
IP6CB(skb)->flags |= IP6SKB_FORWARDED;
return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
- net, NULL, skb, skb->dev, dev,
+ net, NULL, skb, skb->dev, vif_dev,
ip6mr_forward2_finish);
out_free:
@@ -2092,17 +2096,20 @@ out_free:
return 0;
}
+/* Called with rcu_read_lock() */
static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev)
{
int ct;
- for (ct = mrt->maxvif - 1; ct >= 0; ct--) {
- if (mrt->vif_table[ct].dev == dev)
+ /* Pairs with WRITE_ONCE() in mif6_delete()/mif6_add() */
+ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) {
+ if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev)
break;
}
return ct;
}
+/* Called under rcu_read_lock() */
static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
struct net_device *dev, struct sk_buff *skb,
struct mfc6_cache *c)
@@ -2122,20 +2129,18 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
/* For an (*,G) entry, we only check that the incoming
* interface is part of the static tree.
*/
- rcu_read_lock();
cache_proxy = mr_mfc_find_any_parent(mrt, vif);
if (cache_proxy &&
cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) {
rcu_read_unlock();
goto forward;
}
- rcu_read_unlock();
}
/*
* Wrong interface: drop packet and (maybe) send PIM assert.
*/
- if (mrt->vif_table[vif].dev != dev) {
+ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) {
c->_c.mfc_un.res.wrong_if++;
if (true_vifi >= 0 && mrt->mroute_do_assert &&
@@ -2159,8 +2164,10 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
}
forward:
- mrt->vif_table[vif].pkt_in++;
- mrt->vif_table[vif].bytes_in += skb->len;
+ WRITE_ONCE(mrt->vif_table[vif].pkt_in,
+ mrt->vif_table[vif].pkt_in + 1);
+ WRITE_ONCE(mrt->vif_table[vif].bytes_in,
+ mrt->vif_table[vif].bytes_in + skb->len);
/*
* Forward the frame
@@ -2238,7 +2245,6 @@ int ip6_mr_input(struct sk_buff *skb)
return err;
}
- read_lock(&mrt_lock);
cache = ip6mr_cache_find(mrt,
&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr);
if (!cache) {
@@ -2259,19 +2265,15 @@ int ip6_mr_input(struct sk_buff *skb)
vif = ip6mr_find_vif(mrt, dev);
if (vif >= 0) {
int err = ip6mr_cache_unresolved(mrt, vif, skb, dev);
- read_unlock(&mrt_lock);
return err;
}
- read_unlock(&mrt_lock);
kfree_skb(skb);
return -ENODEV;
}
ip6_mr_forward(net, mrt, dev, skb, cache);
- read_unlock(&mrt_lock);
-
return 0;
}
@@ -2287,7 +2289,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
if (!mrt)
return -ENOENT;
- read_lock(&mrt_lock);
+ rcu_read_lock();
cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr);
if (!cache && skb->dev) {
int vif = ip6mr_find_vif(mrt, skb->dev);
@@ -2305,14 +2307,14 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
dev = skb->dev;
if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) {
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -ENODEV;
}
/* really correct? */
skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
if (!skb2) {
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -ENOMEM;
}
@@ -2335,13 +2337,13 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
iph->daddr = rt->rt6i_dst.addr;
err = ip6mr_cache_unresolved(mrt, vif, skb2, dev);
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return err;
}
err = mr_fill_mroute(mrt, skb, &cache->_c, rtm);
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return err;
}
@@ -2460,7 +2462,7 @@ static size_t mrt6msg_netlink_msgsize(size_t payloadlen)
return len;
}
-static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
+static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt)
{
struct net *net = read_pnet(&mrt->net);
struct nlmsghdr *nlh;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index b0dfe97ea4ee..cd84cbdac0a2 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -128,6 +128,7 @@ struct neigh_table nd_tbl = {
[NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER,
[NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME,
[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+ [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
[NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
[NEIGH_VAR_PROXY_QLEN] = 64,
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 46b560aacc11..722de9dd0ff7 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -332,7 +332,6 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
u8 type, u8 code, int inner_offset, __be32 info)
{
- const struct in6_addr *saddr, *daddr;
struct net *net = dev_net(skb->dev);
struct hlist_nulls_head *hlist;
struct hlist_nulls_node *hnode;
@@ -345,8 +344,6 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
sk_nulls_for_each(sk, hnode, hlist) {
/* Note: ipv6_hdr(skb) != skb->data */
const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data;
- saddr = &ip6h->saddr;
- daddr = &ip6h->daddr;
if (!raw_v6_match(net, sk, nexthdr, &ip6h->saddr, &ip6h->daddr,
inet6_iif(skb), inet6_iif(skb)))
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0be01a4d48c1..70cd50c1fa6f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4569,8 +4569,15 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
}
f6i = ip6_route_info_create(&cfg, gfp_flags, NULL);
- if (!IS_ERR(f6i))
+ if (!IS_ERR(f6i)) {
f6i->dst_nocount = true;
+
+ if (!anycast &&
+ (net->ipv6.devconf_all->disable_policy ||
+ idev->cnf.disable_policy))
+ f6i->dst_nopolicy = true;
+ }
+
return f6i;
}
@@ -5934,7 +5941,7 @@ int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
rcu_read_unlock();
if (err)
- return count += w.count;
+ return count + w.count;
}
return -1;
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index 6de01185cc68..d43c50a7310d 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -406,7 +406,6 @@ int __net_init seg6_hmac_net_init(struct net *net)
return rhashtable_init(&sdata->hmac_infos, &rht_params);
}
-EXPORT_SYMBOL(seg6_hmac_net_init);
void seg6_hmac_exit(void)
{
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index fab89fd978f0..6b73b7a5f175 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -323,8 +323,6 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __u
kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) :
NULL;
- rcu_read_lock();
-
ca = min(t->prl_count, cmax);
if (!kp) {
@@ -341,7 +339,7 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __u
}
}
- c = 0;
+ rcu_read_lock();
for_each_prl_rcu(t->prl) {
if (c >= cmax)
break;
@@ -353,7 +351,7 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __u
if (kprl.addr != htonl(INADDR_ANY))
break;
}
-out:
+
rcu_read_unlock();
len = sizeof(*kp) * c;
@@ -362,7 +360,7 @@ out:
ret = -EFAULT;
kfree(kp);
-
+out:
return ret;
}
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 9d1aafe75f92..4595b56d175d 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -184,7 +184,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
session->pwtype == L2TP_PWTYPE_PPP ? "PPP" :
"");
if (session->send_seq || session->recv_seq)
- seq_printf(m, " nr %hu, ns %hu\n", session->nr, session->ns);
+ seq_printf(m, " nr %u, ns %u\n", session->nr, session->ns);
seq_printf(m, " refcnt %d\n", refcount_read(&session->ref_count));
seq_printf(m, " config 0/0/%c/%c/-/%s %08x %u\n",
session->recv_seq ? 'R' : '-',
@@ -192,7 +192,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
session->lns_mode ? "LNS" : "LAC",
0,
jiffies_to_msecs(session->reorder_timeout));
- seq_printf(m, " offset 0 l2specific %hu/%hu\n",
+ seq_printf(m, " offset 0 l2specific %hu/%d\n",
session->l2specific_type, l2tp_get_l2specific_len(session));
if (session->cookie_len) {
seq_printf(m, " cookie %02x%02x%02x%02x",
@@ -215,7 +215,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
seq_puts(m, "\n");
}
- seq_printf(m, " %hu/%hu tx %ld/%ld/%ld rx %ld/%ld/%ld\n",
+ seq_printf(m, " %u/%u tx %ld/%ld/%ld rx %ld/%ld/%ld\n",
session->nr, session->ns,
atomic_long_read(&session->stats.tx_packets),
atomic_long_read(&session->stats.tx_bytes),
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 8be1fdc68a0b..db2e584c625e 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1553,7 +1553,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
session->lns_mode ? "LNS" : "LAC",
0,
jiffies_to_msecs(session->reorder_timeout));
- seq_printf(m, " %hu/%hu %ld/%ld/%ld %ld/%ld/%ld\n",
+ seq_printf(m, " %u/%u %ld/%ld/%ld %ld/%ld/%ld\n",
session->nr, session->ns,
atomic_long_read(&session->stats.tx_packets),
atomic_long_read(&session->stats.tx_bytes),
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index be3b918a6d15..bd8f0f425be4 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -765,6 +765,7 @@ static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_bu
opts->suboptions |= OPTION_MPTCP_RST;
opts->reset_transient = subflow->reset_transient;
opts->reset_reason = subflow->reset_reason;
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX);
return true;
}
@@ -788,6 +789,7 @@ static bool mptcp_established_options_fastclose(struct sock *sk,
opts->rcvr_key = msk->remote_key;
pr_debug("FASTCLOSE key=%llu", opts->rcvr_key);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX);
return true;
}
@@ -809,6 +811,7 @@ static bool mptcp_established_options_mp_fail(struct sock *sk,
opts->fail_seq = subflow->map_seq;
pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX);
return true;
}
@@ -833,13 +836,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) {
*size += opt_size;
remaining -= opt_size;
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX);
}
/* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */
if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) {
*size += opt_size;
remaining -= opt_size;
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX);
}
return true;
}
@@ -966,7 +967,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
goto reset;
subflow->mp_capable = 0;
pr_fallback(msk);
- __mptcp_do_fallback(msk);
+ mptcp_do_fallback(ssk);
return false;
}
@@ -1583,6 +1584,9 @@ mp_rst:
*ptr++ = mptcp_option(MPTCPOPT_MP_PRIO,
TCPOLEN_MPTCP_PRIO,
opts->backup, TCPOPT_NOP);
+
+ MPTCP_INC_STATS(sock_net((const struct sock *)tp),
+ MPTCP_MIB_MPPRIOTX);
}
mp_capable_done:
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 59a85220edc9..45e2a48397b9 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -299,23 +299,21 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
- struct sock *s = (struct sock *)msk;
pr_debug("fail_seq=%llu", fail_seq);
if (!READ_ONCE(msk->allow_infinite_fallback))
return;
- if (!READ_ONCE(subflow->mp_fail_response_expect)) {
+ if (!subflow->fail_tout) {
pr_debug("send MP_FAIL response and infinite map");
subflow->send_mp_fail = 1;
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX);
subflow->send_infinite_map = 1;
- } else if (!sock_flag(sk, SOCK_DEAD)) {
+ tcp_send_ack(sk);
+ } else {
pr_debug("MP_FAIL response received");
-
- sk_stop_timer(s, &s->sk_timer);
+ WRITE_ONCE(subflow->fail_tout, 0);
}
}
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index e099f2a12504..5bdb559d5242 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -717,9 +717,10 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk)
}
}
-static int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr,
- u8 bkup)
+int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr,
+ struct mptcp_addr_info *rem,
+ u8 bkup)
{
struct mptcp_subflow_context *subflow;
@@ -727,24 +728,29 @@ static int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- struct sock *sk = (struct sock *)msk;
- struct mptcp_addr_info local;
+ struct mptcp_addr_info local, remote;
+ bool slow;
local_address((struct sock_common *)ssk, &local);
if (!mptcp_addresses_equal(&local, addr, addr->port))
continue;
+ if (rem && rem->family != AF_UNSPEC) {
+ remote_address((struct sock_common *)ssk, &remote);
+ if (!mptcp_addresses_equal(&remote, rem, rem->port))
+ continue;
+ }
+
+ slow = lock_sock_fast(ssk);
if (subflow->backup != bkup)
msk->last_snd = NULL;
subflow->backup = bkup;
subflow->send_mp_prio = 1;
subflow->request_bkup = bkup;
- __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIOTX);
- spin_unlock_bh(&msk->pm.lock);
pr_debug("send ack for mp_prio");
- mptcp_subflow_send_ack(ssk);
- spin_lock_bh(&msk->pm.lock);
+ __mptcp_subflow_send_ack(ssk);
+ unlock_sock_fast(ssk, slow);
return 0;
}
@@ -801,7 +807,8 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
removed = true;
__MPTCP_INC_STATS(sock_net(sk), rm_type);
}
- __set_bit(rm_list->ids[i], msk->pm.id_avail_bitmap);
+ if (rm_type == MPTCP_MIB_RMSUBFLOW)
+ __set_bit(rm_list->ids[i], msk->pm.id_avail_bitmap);
if (!removed)
continue;
@@ -1127,7 +1134,7 @@ void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ss
}
unlock_sock_fast(ssk, slow);
- /* always try to push the pending data regarless of re-injections:
+ /* always try to push the pending data regardless of re-injections:
* we can possibly use backup subflows now, and subflow selection
* is cheap under the msk socket lock
*/
@@ -1816,8 +1823,10 @@ static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk,
list.ids[list.nr++] = addr->id;
+ spin_lock_bh(&msk->pm.lock);
mptcp_pm_nl_rm_subflow_received(msk, &list);
mptcp_pm_create_subflow_or_signal_addr(msk);
+ spin_unlock_bh(&msk->pm.lock);
}
static int mptcp_nl_set_flags(struct net *net,
@@ -1835,12 +1844,10 @@ static int mptcp_nl_set_flags(struct net *net,
goto next;
lock_sock(sk);
- spin_lock_bh(&msk->pm.lock);
if (changed & MPTCP_PM_ADDR_FLAG_BACKUP)
- ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, bkup);
+ ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup);
if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH)
mptcp_pm_nl_fullmesh(msk, addr);
- spin_unlock_bh(&msk->pm.lock);
release_sock(sk);
next:
@@ -1854,6 +1861,9 @@ next:
static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info)
{
struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }, *entry;
+ struct mptcp_pm_addr_entry remote = { .addr = { .family = AF_UNSPEC }, };
+ struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
+ struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP |
@@ -1866,6 +1876,12 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info)
if (ret < 0)
return ret;
+ if (attr_rem) {
+ ret = mptcp_pm_parse_entry(attr_rem, info, false, &remote);
+ if (ret < 0)
+ return ret;
+ }
+
if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP)
bkup = 1;
if (addr.addr.family == AF_UNSPEC) {
@@ -1874,6 +1890,10 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info)
return -EOPNOTSUPP;
}
+ if (token)
+ return mptcp_userspace_pm_set_flags(sock_net(skb->sk),
+ token, &addr, &remote, bkup);
+
spin_lock_bh(&pernet->lock);
entry = __lookup_addr(pernet, &addr.addr, lookup_by_id);
if (!entry) {
diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c
index f56378e4f597..9e82250cbb70 100644
--- a/net/mptcp/pm_userspace.c
+++ b/net/mptcp/pm_userspace.c
@@ -5,6 +5,7 @@
*/
#include "protocol.h"
+#include "mib.h"
void mptcp_free_local_addr_list(struct mptcp_sock *msk)
{
@@ -306,15 +307,11 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk,
const struct mptcp_addr_info *local,
const struct mptcp_addr_info *remote)
{
- struct sock *sk = &msk->sk.icsk_inet.sk;
struct mptcp_subflow_context *subflow;
- struct sock *found = NULL;
if (local->family != remote->family)
return NULL;
- lock_sock(sk);
-
mptcp_for_each_subflow(msk, subflow) {
const struct inet_sock *issk;
struct sock *ssk;
@@ -347,16 +344,11 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk,
}
if (issk->inet_sport == local->port &&
- issk->inet_dport == remote->port) {
- found = ssk;
- goto found;
- }
+ issk->inet_dport == remote->port)
+ return ssk;
}
-found:
- release_sock(sk);
-
- return found;
+ return NULL;
}
int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info)
@@ -412,18 +404,51 @@ int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info)
}
sk = &msk->sk.icsk_inet.sk;
+ lock_sock(sk);
ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r);
if (ssk) {
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN);
mptcp_close_ssk(sk, ssk, subflow);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW);
err = 0;
} else {
err = -ESRCH;
}
+ release_sock(sk);
- destroy_err:
+destroy_err:
sock_put((struct sock *)msk);
return err;
}
+
+int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token,
+ struct mptcp_pm_addr_entry *loc,
+ struct mptcp_pm_addr_entry *rem, u8 bkup)
+{
+ struct mptcp_sock *msk;
+ int ret = -EINVAL;
+ u32 token_val;
+
+ token_val = nla_get_u32(token);
+
+ msk = mptcp_token_get_sock(net, token_val);
+ if (!msk)
+ return ret;
+
+ if (!mptcp_pm_is_userspace(msk))
+ goto set_flags_err;
+
+ if (loc->addr.family == AF_UNSPEC ||
+ rem->addr.family == AF_UNSPEC)
+ goto set_flags_err;
+
+ lock_sock((struct sock *)msk);
+ ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc->addr, &rem->addr, bkup);
+ release_sock((struct sock *)msk);
+
+set_flags_err:
+ sock_put((struct sock *)msk);
+ return ret;
+}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index e0fb9f96c45c..2caad4a3adea 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -181,8 +181,8 @@ static void mptcp_rmem_uncharge(struct sock *sk, int size)
reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
/* see sk_mem_uncharge() for the rationale behind the following schema */
- if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD))
- __mptcp_rmem_reclaim(sk, SK_RECLAIM_CHUNK);
+ if (unlikely(reclaimable >= PAGE_SIZE))
+ __mptcp_rmem_reclaim(sk, reclaimable);
}
static void mptcp_rfree(struct sk_buff *skb)
@@ -323,20 +323,16 @@ static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
struct mptcp_sock *msk = mptcp_sk(sk);
int amt, amount;
- if (size < msk->rmem_fwd_alloc)
+ if (size <= msk->rmem_fwd_alloc)
return true;
+ size -= msk->rmem_fwd_alloc;
amt = sk_mem_pages(size);
amount = amt << PAGE_SHIFT;
- msk->rmem_fwd_alloc += amount;
- if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) {
- if (ssk->sk_forward_alloc < amount) {
- msk->rmem_fwd_alloc -= amount;
- return false;
- }
+ if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV))
+ return false;
- ssk->sk_forward_alloc -= amount;
- }
+ msk->rmem_fwd_alloc += amount;
return true;
}
@@ -500,19 +496,24 @@ static void mptcp_set_timeout(struct sock *sk)
__mptcp_set_timeout(sk, tout);
}
-static bool tcp_can_send_ack(const struct sock *ssk)
+static inline bool tcp_can_send_ack(const struct sock *ssk)
{
return !((1 << inet_sk_state_load(ssk)) &
(TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN));
}
+void __mptcp_subflow_send_ack(struct sock *ssk)
+{
+ if (tcp_can_send_ack(ssk))
+ tcp_send_ack(ssk);
+}
+
void mptcp_subflow_send_ack(struct sock *ssk)
{
bool slow;
slow = lock_sock_fast(ssk);
- if (tcp_can_send_ack(ssk))
- tcp_send_ack(ssk);
+ __mptcp_subflow_send_ack(ssk);
unlock_sock_fast(ssk, slow);
}
@@ -966,25 +967,6 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
df->data_seq + df->data_len == msk->write_seq;
}
-static void __mptcp_mem_reclaim_partial(struct sock *sk)
-{
- int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
-
- lockdep_assert_held_once(&sk->sk_lock.slock);
-
- if (reclaimable > (int)PAGE_SIZE)
- __mptcp_rmem_reclaim(sk, reclaimable - 1);
-
- sk_mem_reclaim(sk);
-}
-
-static void mptcp_mem_reclaim_partial(struct sock *sk)
-{
- mptcp_data_lock(sk);
- __mptcp_mem_reclaim_partial(sk);
- mptcp_data_unlock(sk);
-}
-
static void dfrag_uncharge(struct sock *sk, int len)
{
sk_mem_uncharge(sk, len);
@@ -1004,7 +986,6 @@ static void __mptcp_clean_una(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_data_frag *dtmp, *dfrag;
- bool cleaned = false;
u64 snd_una;
/* on fallback we just need to ignore snd_una, as this is really
@@ -1027,7 +1008,6 @@ static void __mptcp_clean_una(struct sock *sk)
}
dfrag_clear(sk, dfrag);
- cleaned = true;
}
dfrag = mptcp_rtx_head(sk);
@@ -1049,7 +1029,6 @@ static void __mptcp_clean_una(struct sock *sk)
dfrag->already_sent -= delta;
dfrag_uncharge(sk, delta);
- cleaned = true;
}
/* all retransmitted data acked, recovery completed */
@@ -1057,9 +1036,6 @@ static void __mptcp_clean_una(struct sock *sk)
msk->recovery = false;
out:
- if (cleaned && tcp_under_memory_pressure(sk))
- __mptcp_mem_reclaim_partial(sk);
-
if (snd_una == READ_ONCE(msk->snd_nxt) &&
snd_una == READ_ONCE(msk->write_seq)) {
if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
@@ -1211,12 +1187,6 @@ static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, boo
{
gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation;
- if (unlikely(tcp_under_memory_pressure(sk))) {
- if (data_lock_held)
- __mptcp_mem_reclaim_partial(sk);
- else
- mptcp_mem_reclaim_partial(sk);
- }
return __mptcp_alloc_tx_skb(sk, ssk, gfp);
}
@@ -1245,7 +1215,7 @@ static void mptcp_update_infinite_map(struct mptcp_sock *msk,
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX);
mptcp_subflow_ctx(ssk)->send_infinite_map = 0;
pr_fallback(msk);
- __mptcp_do_fallback(msk);
+ mptcp_do_fallback(ssk);
}
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
@@ -2175,21 +2145,6 @@ static void mptcp_retransmit_timer(struct timer_list *t)
sock_put(sk);
}
-static struct mptcp_subflow_context *
-mp_fail_response_expect_subflow(struct mptcp_sock *msk)
-{
- struct mptcp_subflow_context *subflow, *ret = NULL;
-
- mptcp_for_each_subflow(msk, subflow) {
- if (READ_ONCE(subflow->mp_fail_response_expect)) {
- ret = subflow;
- break;
- }
- }
-
- return ret;
-}
-
static void mptcp_timeout_timer(struct timer_list *t)
{
struct sock *sk = from_timer(sk, t, sk_timer);
@@ -2346,6 +2301,11 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
kfree_rcu(subflow, rcu);
} else {
/* otherwise tcp will dispose of the ssk and subflow ctx */
+ if (ssk->sk_state == TCP_LISTEN) {
+ tcp_set_state(ssk, TCP_CLOSE);
+ mptcp_subflow_queue_clean(ssk);
+ inet_csk_listen_stop(ssk);
+ }
__tcp_close(ssk, 0);
/* close acquired an extra ref */
@@ -2518,27 +2478,50 @@ reset_timer:
mptcp_reset_timer(sk);
}
+/* schedule the timeout timer for the relevant event: either close timeout
+ * or mp_fail timeout. The close timeout takes precedence on the mp_fail one
+ */
+void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout)
+{
+ struct sock *sk = (struct sock *)msk;
+ unsigned long timeout, close_timeout;
+
+ if (!fail_tout && !sock_flag(sk, SOCK_DEAD))
+ return;
+
+ close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN;
+
+ /* the close timeout takes precedence on the fail one, and here at least one of
+ * them is active
+ */
+ timeout = sock_flag(sk, SOCK_DEAD) ? close_timeout : fail_tout;
+
+ sk_reset_timer(sk, &sk->sk_timer, timeout);
+}
+
static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
{
- struct mptcp_subflow_context *subflow;
- struct sock *ssk;
+ struct sock *ssk = msk->first;
bool slow;
- subflow = mp_fail_response_expect_subflow(msk);
- if (subflow) {
- pr_debug("MP_FAIL doesn't respond, reset the subflow");
+ if (!ssk)
+ return;
+
+ pr_debug("MP_FAIL doesn't respond, reset the subflow");
- ssk = mptcp_subflow_tcp_sock(subflow);
- slow = lock_sock_fast(ssk);
- mptcp_subflow_reset(ssk);
- unlock_sock_fast(ssk, slow);
- }
+ slow = lock_sock_fast(ssk);
+ mptcp_subflow_reset(ssk);
+ WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0);
+ unlock_sock_fast(ssk, slow);
+
+ mptcp_reset_timeout(msk, 0);
}
static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
struct sock *sk = &msk->sk.icsk_inet.sk;
+ unsigned long fail_tout;
int state;
lock_sock(sk);
@@ -2575,7 +2558,9 @@ static void mptcp_worker(struct work_struct *work)
if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
__mptcp_retrans(sk);
- mptcp_mp_fail_no_response(msk);
+ fail_tout = msk->first ? READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0;
+ if (fail_tout && time_after(jiffies, fail_tout))
+ mptcp_mp_fail_no_response(msk);
unlock:
release_sock(sk);
@@ -2822,6 +2807,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
static void mptcp_close(struct sock *sk, long timeout)
{
struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
bool do_cancel_work = false;
lock_sock(sk);
@@ -2840,10 +2826,16 @@ static void mptcp_close(struct sock *sk, long timeout)
cleanup:
/* orphan all the subflows */
inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
- mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
+ mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow = lock_sock_fast_nested(ssk);
+ /* since the close timeout takes precedence on the fail one,
+ * cancel the latter
+ */
+ if (ssk == msk->first)
+ subflow->fail_tout = 0;
+
sock_orphan(ssk);
unlock_sock_fast(ssk, slow);
}
@@ -2852,13 +2844,13 @@ cleanup:
sock_hold(sk);
pr_debug("msk=%p state=%d", sk, sk->sk_state);
if (mptcp_sk(sk)->token)
- mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
+ mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
if (sk->sk_state == TCP_CLOSE) {
__mptcp_destroy_sock(sk);
do_cancel_work = true;
} else {
- sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN);
+ mptcp_reset_timeout(msk, 0);
}
release_sock(sk);
if (do_cancel_work)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 200f89f6d62f..07871e10e510 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -83,7 +83,6 @@
/* MPTCP MP_JOIN flags */
#define MPTCPOPT_BACKUP BIT(0)
-#define MPTCPOPT_HMAC_LEN 20
#define MPTCPOPT_THMAC_LEN 8
/* MPTCP MP_CAPABLE flags */
@@ -306,6 +305,7 @@ struct mptcp_sock {
u32 setsockopt_seq;
char ca_name[TCP_CA_NAME_MAX];
+ struct mptcp_sock *dl_next;
};
#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
@@ -468,7 +468,6 @@ struct mptcp_subflow_context {
local_id_valid : 1, /* local_id is correctly initialized */
valid_csum_seen : 1; /* at least one csum validated */
enum mptcp_data_avail data_avail;
- bool mp_fail_response_expect;
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
@@ -482,6 +481,7 @@ struct mptcp_subflow_context {
u8 stale_count;
long delegated_status;
+ unsigned long fail_tout;
);
@@ -606,8 +606,10 @@ void __init mptcp_subflow_init(void);
void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how);
void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
struct mptcp_subflow_context *subflow);
+void __mptcp_subflow_send_ack(struct sock *ssk);
void mptcp_subflow_send_ack(struct sock *ssk);
void mptcp_subflow_reset(struct sock *ssk);
+void mptcp_subflow_queue_clean(struct sock *ssk);
void mptcp_sock_graft(struct sock *sk, struct socket *parent);
struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
@@ -662,6 +664,7 @@ void mptcp_get_options(const struct sk_buff *skb,
void mptcp_finish_connect(struct sock *sk);
void __mptcp_set_connected(struct sock *sk);
+void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout);
static inline bool mptcp_is_fully_established(struct sock *sk)
{
return inet_sk_state_load(sk) == TCP_ESTABLISHED &&
@@ -768,6 +771,10 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
const struct mptcp_rm_list *rm_list);
void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup);
void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq);
+int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr,
+ struct mptcp_addr_info *rem,
+ u8 bkup);
bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
const struct mptcp_pm_addr_entry *entry);
void mptcp_pm_free_anno_list(struct mptcp_sock *msk);
@@ -784,7 +791,9 @@ int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
unsigned int id,
u8 *flags, int *ifindex);
-
+int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token,
+ struct mptcp_pm_addr_entry *loc,
+ struct mptcp_pm_addr_entry *rem, u8 bkup);
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr,
bool echo);
@@ -926,12 +935,25 @@ static inline void __mptcp_do_fallback(struct mptcp_sock *msk)
set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
}
-static inline void mptcp_do_fallback(struct sock *sk)
+static inline void mptcp_do_fallback(struct sock *ssk)
{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = subflow->conn;
+ struct mptcp_sock *msk;
+ msk = mptcp_sk(sk);
__mptcp_do_fallback(msk);
+ if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) {
+ gfp_t saved_allocation = ssk->sk_allocation;
+
+ /* we are in a atomic (BH) scope, override ssk default for data
+ * fin allocation
+ */
+ ssk->sk_allocation = GFP_ATOMIC;
+ ssk->sk_shutdown |= SEND_SHUTDOWN;
+ tcp_shutdown(ssk, SEND_SHUTDOWN);
+ ssk->sk_allocation = saved_allocation;
+ }
}
#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a)
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 8841e8cd9ad8..d4b16d033978 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -843,7 +843,8 @@ enum mapping_status {
MAPPING_INVALID,
MAPPING_EMPTY,
MAPPING_DATA_FIN,
- MAPPING_DUMMY
+ MAPPING_DUMMY,
+ MAPPING_BAD_CSUM
};
static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
@@ -958,11 +959,7 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *
subflow->map_data_csum);
if (unlikely(csum)) {
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR);
- if (subflow->mp_join || subflow->valid_csum_seen) {
- subflow->send_mp_fail = 1;
- MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPFAILTX);
- }
- return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY;
+ return MAPPING_BAD_CSUM;
}
subflow->valid_csum_seen = 1;
@@ -974,7 +971,6 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
bool csum_reqd = READ_ONCE(msk->csum_enabled);
- struct sock *sk = (struct sock *)msk;
struct mptcp_ext *mpext;
struct sk_buff *skb;
u16 data_len;
@@ -1016,9 +1012,6 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
pr_debug("infinite mapping received");
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
subflow->map_data_len = 0;
- if (!sock_flag(ssk, SOCK_DEAD))
- sk_stop_timer(sk, &sk->sk_timer);
-
return MAPPING_INVALID;
}
@@ -1165,6 +1158,33 @@ static bool subflow_can_fallback(struct mptcp_subflow_context *subflow)
return !subflow->fully_established;
}
+static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ unsigned long fail_tout;
+
+ /* greceful failure can happen only on the MPC subflow */
+ if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
+ return;
+
+ /* since the close timeout take precedence on the fail one,
+ * no need to start the latter when the first is already set
+ */
+ if (sock_flag((struct sock *)msk, SOCK_DEAD))
+ return;
+
+ /* we don't need extreme accuracy here, use a zero fail_tout as special
+ * value meaning no fail timeout at all;
+ */
+ fail_tout = jiffies + TCP_RTO_MAX;
+ if (!fail_tout)
+ fail_tout = 1;
+ WRITE_ONCE(subflow->fail_tout, fail_tout);
+ tcp_send_ack(ssk);
+
+ mptcp_reset_timeout(msk, subflow->fail_tout);
+}
+
static bool subflow_check_data_avail(struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
@@ -1184,10 +1204,8 @@ static bool subflow_check_data_avail(struct sock *ssk)
status = get_mapping_status(ssk, msk);
trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue));
- if (unlikely(status == MAPPING_INVALID))
- goto fallback;
-
- if (unlikely(status == MAPPING_DUMMY))
+ if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY ||
+ status == MAPPING_BAD_CSUM))
goto fallback;
if (status != MAPPING_OK)
@@ -1229,22 +1247,17 @@ no_data:
fallback:
if (!__mptcp_check_fallback(msk)) {
/* RFC 8684 section 3.7. */
- if (subflow->send_mp_fail) {
+ if (status == MAPPING_BAD_CSUM &&
+ (subflow->mp_join || subflow->valid_csum_seen)) {
+ subflow->send_mp_fail = 1;
+
if (!READ_ONCE(msk->allow_infinite_fallback)) {
- ssk->sk_err = EBADMSG;
- tcp_set_state(ssk, TCP_CLOSE);
subflow->reset_transient = 0;
subflow->reset_reason = MPTCP_RST_EMIDDLEBOX;
- tcp_send_active_reset(ssk, GFP_ATOMIC);
- while ((skb = skb_peek(&ssk->sk_receive_queue)))
- sk_eat_skb(ssk, skb);
- } else if (!sock_flag(ssk, SOCK_DEAD)) {
- WRITE_ONCE(subflow->mp_fail_response_expect, true);
- sk_reset_timer((struct sock *)msk,
- &((struct sock *)msk)->sk_timer,
- jiffies + TCP_RTO_MAX);
+ goto reset;
}
- WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
+ mptcp_subflow_fail(msk, ssk);
+ WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
return true;
}
@@ -1252,16 +1265,20 @@ fallback:
/* fatal protocol error, close the socket.
* subflow_error_report() will introduce the appropriate barriers
*/
- ssk->sk_err = EBADMSG;
- tcp_set_state(ssk, TCP_CLOSE);
subflow->reset_transient = 0;
subflow->reset_reason = MPTCP_RST_EMPTCP;
+
+reset:
+ ssk->sk_err = EBADMSG;
+ tcp_set_state(ssk, TCP_CLOSE);
+ while ((skb = skb_peek(&ssk->sk_receive_queue)))
+ sk_eat_skb(ssk, skb);
tcp_send_active_reset(ssk, GFP_ATOMIC);
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
return false;
}
- __mptcp_do_fallback(msk);
+ mptcp_do_fallback(ssk);
}
skb = skb_peek(&ssk->sk_receive_queue);
@@ -1617,7 +1634,7 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
/* the newly created socket really belongs to the owning MPTCP master
* socket, even if for additional subflows the allocation is performed
* by a kernel workqueue. Adjust inode references, so that the
- * procfs/diag interaces really show this one belonging to the correct
+ * procfs/diag interfaces really show this one belonging to the correct
* user.
*/
SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
@@ -1706,6 +1723,58 @@ static void subflow_state_change(struct sock *sk)
}
}
+void mptcp_subflow_queue_clean(struct sock *listener_ssk)
+{
+ struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
+ struct mptcp_sock *msk, *next, *head = NULL;
+ struct request_sock *req;
+
+ /* build a list of all unaccepted mptcp sockets */
+ spin_lock_bh(&queue->rskq_lock);
+ for (req = queue->rskq_accept_head; req; req = req->dl_next) {
+ struct mptcp_subflow_context *subflow;
+ struct sock *ssk = req->sk;
+ struct mptcp_sock *msk;
+
+ if (!sk_is_mptcp(ssk))
+ continue;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ if (!subflow || !subflow->conn)
+ continue;
+
+ /* skip if already in list */
+ msk = mptcp_sk(subflow->conn);
+ if (msk->dl_next || msk == head)
+ continue;
+
+ msk->dl_next = head;
+ head = msk;
+ }
+ spin_unlock_bh(&queue->rskq_lock);
+ if (!head)
+ return;
+
+ /* can't acquire the msk socket lock under the subflow one,
+ * or will cause ABBA deadlock
+ */
+ release_sock(listener_ssk);
+
+ for (msk = head; msk; msk = next) {
+ struct sock *sk = (struct sock *)msk;
+ bool slow;
+
+ slow = lock_sock_fast_nested(sk);
+ next = msk->dl_next;
+ msk->first = NULL;
+ msk->dl_next = NULL;
+ unlock_sock_fast(sk, slow);
+ }
+
+ /* we are still under the listener msk socket lock */
+ lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);
+}
+
static int subflow_ulp_init(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 78814417d753..80713febfac6 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -1803,7 +1803,8 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
pdev = to_platform_device(dev->dev.parent);
if (pdev) {
np = pdev->dev.of_node;
- if (np && of_get_property(np, "mlx,multi-host", NULL))
+ if (np && (of_get_property(np, "mellanox,multi-host", NULL) ||
+ of_get_property(np, "mlx,multi-host", NULL)))
ndp->mlx_multi_host = true;
}
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index 7873bd1389c3..a8e2425e43b0 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -13,14 +13,31 @@
#include <net/netfilter/nf_tables_offload.h>
#include <net/netfilter/nf_dup_netdev.h>
-static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev)
+#define NF_RECURSION_LIMIT 2
+
+static DEFINE_PER_CPU(u8, nf_dup_skb_recursion);
+
+static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
+ enum nf_dev_hooks hook)
{
- if (skb_mac_header_was_set(skb))
+ if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT)
+ goto err;
+
+ if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) {
+ if (skb_cow_head(skb, skb->mac_len))
+ goto err;
+
skb_push(skb, skb->mac_len);
+ }
skb->dev = dev;
skb_clear_tstamp(skb);
+ __this_cpu_inc(nf_dup_skb_recursion);
dev_queue_xmit(skb);
+ __this_cpu_dec(nf_dup_skb_recursion);
+ return;
+err:
+ kfree_skb(skb);
}
void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)
@@ -33,7 +50,7 @@ void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)
return;
}
- nf_do_netdev_egress(pkt->skb, dev);
+ nf_do_netdev_egress(pkt->skb, dev, nft_hook(pkt));
}
EXPORT_SYMBOL_GPL(nf_fwd_netdev_egress);
@@ -48,7 +65,7 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif)
skb = skb_clone(pkt->skb, GFP_ATOMIC);
if (skb)
- nf_do_netdev_egress(skb, dev);
+ nf_do_netdev_egress(skb, dev, nft_hook(pkt));
}
EXPORT_SYMBOL_GPL(nf_dup_netdev_egress);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 51144fc66889..d6b59beab3a9 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5213,13 +5213,20 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set,
struct nft_data *data,
struct nlattr *attr)
{
+ u32 dtype;
int err;
err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr);
if (err < 0)
return err;
- if (desc->type != NFT_DATA_VERDICT && desc->len != set->dlen) {
+ if (set->dtype == NFT_DATA_VERDICT)
+ dtype = NFT_DATA_VERDICT;
+ else
+ dtype = NFT_DATA_VALUE;
+
+ if (dtype != desc->type ||
+ set->dlen != desc->len) {
nft_data_release(data, desc->type);
return -EINVAL;
}
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 53f40e473855..3ddce24ac76d 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -25,9 +25,7 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info,
const struct nft_chain *chain,
enum nft_trace_types type)
{
- const struct nft_pktinfo *pkt = info->pkt;
-
- if (!info->trace || !pkt->skb->nf_trace)
+ if (!info->trace || !info->nf_trace)
return;
info->chain = chain;
@@ -42,11 +40,24 @@ static inline void nft_trace_packet(struct nft_traceinfo *info,
enum nft_trace_types type)
{
if (static_branch_unlikely(&nft_trace_enabled)) {
+ const struct nft_pktinfo *pkt = info->pkt;
+
+ info->nf_trace = pkt->skb->nf_trace;
info->rule = rule;
__nft_trace_packet(info, chain, type);
}
}
+static inline void nft_trace_copy_nftrace(struct nft_traceinfo *info)
+{
+ if (static_branch_unlikely(&nft_trace_enabled)) {
+ const struct nft_pktinfo *pkt = info->pkt;
+
+ if (info->trace)
+ info->nf_trace = pkt->skb->nf_trace;
+ }
+}
+
static void nft_bitwise_fast_eval(const struct nft_expr *expr,
struct nft_regs *regs)
{
@@ -85,6 +96,7 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info,
const struct nft_chain *chain,
const struct nft_regs *regs)
{
+ const struct nft_pktinfo *pkt = info->pkt;
enum nft_trace_types type;
switch (regs->verdict.code) {
@@ -92,8 +104,13 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info,
case NFT_RETURN:
type = NFT_TRACETYPE_RETURN;
break;
+ case NF_STOLEN:
+ type = NFT_TRACETYPE_RULE;
+ /* can't access skb->nf_trace; use copy */
+ break;
default:
type = NFT_TRACETYPE_RULE;
+ info->nf_trace = pkt->skb->nf_trace;
break;
}
@@ -254,6 +271,7 @@ next_rule:
switch (regs.verdict.code) {
case NFT_BREAK:
regs.verdict.code = NFT_CONTINUE;
+ nft_trace_copy_nftrace(&info);
continue;
case NFT_CONTINUE:
nft_trace_packet(&info, chain, rule,
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 5041725423c2..1163ba9c1401 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -7,7 +7,7 @@
#include <linux/module.h>
#include <linux/static_key.h>
#include <linux/hash.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/skbuff.h>
@@ -25,22 +25,6 @@
DEFINE_STATIC_KEY_FALSE(nft_trace_enabled);
EXPORT_SYMBOL_GPL(nft_trace_enabled);
-static int trace_fill_id(struct sk_buff *nlskb, struct sk_buff *skb)
-{
- __be32 id;
-
- /* using skb address as ID results in a limited number of
- * values (and quick reuse).
- *
- * So we attempt to use as many skb members that will not
- * change while skb is with netfilter.
- */
- id = (__be32)jhash_2words(hash32_ptr(skb), skb_get_hash(skb),
- skb->skb_iif);
-
- return nla_put_be32(nlskb, NFTA_TRACE_ID, id);
-}
-
static int trace_fill_header(struct sk_buff *nlskb, u16 type,
const struct sk_buff *skb,
int off, unsigned int len)
@@ -186,6 +170,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
struct nlmsghdr *nlh;
struct sk_buff *skb;
unsigned int size;
+ u32 mark = 0;
u16 event;
if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE))
@@ -229,7 +214,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
if (nla_put_be32(skb, NFTA_TRACE_TYPE, htonl(info->type)))
goto nla_put_failure;
- if (trace_fill_id(skb, pkt->skb))
+ if (nla_put_u32(skb, NFTA_TRACE_ID, info->skbid))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_TRACE_CHAIN, info->chain->name))
@@ -249,16 +234,24 @@ void nft_trace_notify(struct nft_traceinfo *info)
case NFT_TRACETYPE_RULE:
if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, info->verdict))
goto nla_put_failure;
+
+ /* pkt->skb undefined iff NF_STOLEN, disable dump */
+ if (info->verdict->code == NF_STOLEN)
+ info->packet_dumped = true;
+ else
+ mark = pkt->skb->mark;
+
break;
case NFT_TRACETYPE_POLICY:
+ mark = pkt->skb->mark;
+
if (nla_put_be32(skb, NFTA_TRACE_POLICY,
htonl(info->basechain->policy)))
goto nla_put_failure;
break;
}
- if (pkt->skb->mark &&
- nla_put_be32(skb, NFTA_TRACE_MARK, htonl(pkt->skb->mark)))
+ if (mark && nla_put_be32(skb, NFTA_TRACE_MARK, htonl(mark)))
goto nla_put_failure;
if (!info->packet_dumped) {
@@ -283,9 +276,20 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
const struct nft_verdict *verdict,
const struct nft_chain *chain)
{
+ static siphash_key_t trace_key __read_mostly;
+ struct sk_buff *skb = pkt->skb;
+
info->basechain = nft_base_chain(chain);
info->trace = true;
+ info->nf_trace = pkt->skb->nf_trace;
info->packet_dumped = false;
info->pkt = pkt;
info->verdict = verdict;
+
+ net_get_random_once(&trace_key, sizeof(trace_key));
+
+ info->skbid = (u32)siphash_3u32(hash32_ptr(skb),
+ skb_get_hash(skb),
+ skb->skb_iif,
+ &trace_key);
}
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index af15102bc696..f466af4f8531 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -614,7 +614,7 @@ static void __net_exit cttimeout_net_exit(struct net *net)
nf_ct_untimeout(net, NULL);
- list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, head) {
+ list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, free_head) {
list_del(&cur->free_head);
if (refcount_dec_and_test(&cur->refcnt))
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index ac4859241e17..55d2d49c3425 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -14,6 +14,7 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <linux/random.h>
#include <linux/smp.h>
#include <linux/static_key.h>
#include <net/dst.h>
@@ -32,8 +33,6 @@
#define NFT_META_SECS_PER_DAY 86400
#define NFT_META_DAYS_PER_WEEK 7
-static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
-
static u8 nft_meta_weekday(void)
{
time64_t secs = ktime_get_real_seconds();
@@ -271,13 +270,6 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest,
return true;
}
-static noinline u32 nft_prandom_u32(void)
-{
- struct rnd_state *state = this_cpu_ptr(&nft_prandom_state);
-
- return prandom_u32_state(state);
-}
-
#ifdef CONFIG_IP_ROUTE_CLASSID
static noinline bool
nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest)
@@ -389,7 +381,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
break;
#endif
case NFT_META_PRANDOM:
- *dest = nft_prandom_u32();
+ *dest = get_random_u32();
break;
#ifdef CONFIG_XFRM
case NFT_META_SECPATH:
@@ -518,7 +510,6 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
len = IFNAMSIZ;
break;
case NFT_META_PRANDOM:
- prandom_init_once(&nft_prandom_state);
len = sizeof(u32);
break;
#ifdef CONFIG_XFRM
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 81b40c663d86..45d3dc9e96f2 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -9,12 +9,11 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
+#include <linux/random.h>
#include <linux/static_key.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
-static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state);
-
struct nft_ng_inc {
u8 dreg;
u32 modulus;
@@ -135,12 +134,9 @@ struct nft_ng_random {
u32 offset;
};
-static u32 nft_ng_random_gen(struct nft_ng_random *priv)
+static u32 nft_ng_random_gen(const struct nft_ng_random *priv)
{
- struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
-
- return reciprocal_scale(prandom_u32_state(state), priv->modulus) +
- priv->offset;
+ return reciprocal_scale(get_random_u32(), priv->modulus) + priv->offset;
}
static void nft_ng_random_eval(const struct nft_expr *expr,
@@ -168,8 +164,6 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
if (priv->offset + priv->modulus - 1 < priv->offset)
return -EOVERFLOW;
- prandom_init_once(&nft_numgen_prandom_state);
-
return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg,
NULL, NFT_DATA_VALUE, sizeof(u32));
}
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index df40314de21f..76de6c8d9865 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -143,6 +143,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key,
/* Another cpu may race to insert the element with the same key */
if (prev) {
nft_set_elem_destroy(set, he, true);
+ atomic_dec(&set->nelems);
he = prev;
}
@@ -152,6 +153,7 @@ out:
err2:
nft_set_elem_destroy(set, he, true);
+ atomic_dec(&set->nelems);
err1:
return false;
}
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 2c8051d8cca6..4f9299b9dcdd 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -2125,6 +2125,32 @@ out_scratch:
}
/**
+ * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array
+ * @set: nftables API set representation
+ * @m: matching data pointing to key mapping array
+ */
+static void nft_set_pipapo_match_destroy(const struct nft_set *set,
+ struct nft_pipapo_match *m)
+{
+ struct nft_pipapo_field *f;
+ int i, r;
+
+ for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
+ ;
+
+ for (r = 0; r < f->rules; r++) {
+ struct nft_pipapo_elem *e;
+
+ if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e)
+ continue;
+
+ e = f->mt[r].e;
+
+ nft_set_elem_destroy(set, e, true);
+ }
+}
+
+/**
* nft_pipapo_destroy() - Free private data for set and all committed elements
* @set: nftables API set representation
*/
@@ -2132,26 +2158,13 @@ static void nft_pipapo_destroy(const struct nft_set *set)
{
struct nft_pipapo *priv = nft_set_priv(set);
struct nft_pipapo_match *m;
- struct nft_pipapo_field *f;
- int i, r, cpu;
+ int cpu;
m = rcu_dereference_protected(priv->match, true);
if (m) {
rcu_barrier();
- for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
- ;
-
- for (r = 0; r < f->rules; r++) {
- struct nft_pipapo_elem *e;
-
- if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e)
- continue;
-
- e = f->mt[r].e;
-
- nft_set_elem_destroy(set, e, true);
- }
+ nft_set_pipapo_match_destroy(set, m);
#ifdef NFT_PIPAPO_ALIGN
free_percpu(m->scratch_aligned);
@@ -2165,6 +2178,11 @@ static void nft_pipapo_destroy(const struct nft_set *set)
}
if (priv->clone) {
+ m = priv->clone;
+
+ if (priv->dirty)
+ nft_set_pipapo_match_destroy(set, m);
+
#ifdef NFT_PIPAPO_ALIGN
free_percpu(priv->clone->scratch_aligned);
#endif
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 372bf54a0ca9..e20d1a973417 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -407,7 +407,7 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
if (flags & IP6_FH_F_FRAG) {
if (frag_off) {
key->ip.frag = OVS_FRAG_TYPE_LATER;
- key->ip.proto = nexthdr;
+ key->ip.proto = NEXTHDR_FRAGMENT;
return 0;
}
key->ip.frag = OVS_FRAG_TYPE_FIRST;
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index fee6409c2bb3..eb0b8197ac82 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -227,8 +227,8 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh)
{
struct rose_neigh *s;
- rose_stop_ftimer(rose_neigh);
- rose_stop_t0timer(rose_neigh);
+ del_timer_sync(&rose_neigh->ftimer);
+ del_timer_sync(&rose_neigh->t0timer);
skb_queue_purge(&rose_neigh->queue);
diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c
index b3138fc2e552..f06ddbed3fed 100644
--- a/net/rose/rose_timer.c
+++ b/net/rose/rose_timer.c
@@ -31,89 +31,89 @@ static void rose_idletimer_expiry(struct timer_list *);
void rose_start_heartbeat(struct sock *sk)
{
- del_timer(&sk->sk_timer);
+ sk_stop_timer(sk, &sk->sk_timer);
sk->sk_timer.function = rose_heartbeat_expiry;
sk->sk_timer.expires = jiffies + 5 * HZ;
- add_timer(&sk->sk_timer);
+ sk_reset_timer(sk, &sk->sk_timer, sk->sk_timer.expires);
}
void rose_start_t1timer(struct sock *sk)
{
struct rose_sock *rose = rose_sk(sk);
- del_timer(&rose->timer);
+ sk_stop_timer(sk, &rose->timer);
rose->timer.function = rose_timer_expiry;
rose->timer.expires = jiffies + rose->t1;
- add_timer(&rose->timer);
+ sk_reset_timer(sk, &rose->timer, rose->timer.expires);
}
void rose_start_t2timer(struct sock *sk)
{
struct rose_sock *rose = rose_sk(sk);
- del_timer(&rose->timer);
+ sk_stop_timer(sk, &rose->timer);
rose->timer.function = rose_timer_expiry;
rose->timer.expires = jiffies + rose->t2;
- add_timer(&rose->timer);
+ sk_reset_timer(sk, &rose->timer, rose->timer.expires);
}
void rose_start_t3timer(struct sock *sk)
{
struct rose_sock *rose = rose_sk(sk);
- del_timer(&rose->timer);
+ sk_stop_timer(sk, &rose->timer);
rose->timer.function = rose_timer_expiry;
rose->timer.expires = jiffies + rose->t3;
- add_timer(&rose->timer);
+ sk_reset_timer(sk, &rose->timer, rose->timer.expires);
}
void rose_start_hbtimer(struct sock *sk)
{
struct rose_sock *rose = rose_sk(sk);
- del_timer(&rose->timer);
+ sk_stop_timer(sk, &rose->timer);
rose->timer.function = rose_timer_expiry;
rose->timer.expires = jiffies + rose->hb;
- add_timer(&rose->timer);
+ sk_reset_timer(sk, &rose->timer, rose->timer.expires);
}
void rose_start_idletimer(struct sock *sk)
{
struct rose_sock *rose = rose_sk(sk);
- del_timer(&rose->idletimer);
+ sk_stop_timer(sk, &rose->idletimer);
if (rose->idle > 0) {
rose->idletimer.function = rose_idletimer_expiry;
rose->idletimer.expires = jiffies + rose->idle;
- add_timer(&rose->idletimer);
+ sk_reset_timer(sk, &rose->idletimer, rose->idletimer.expires);
}
}
void rose_stop_heartbeat(struct sock *sk)
{
- del_timer(&sk->sk_timer);
+ sk_stop_timer(sk, &sk->sk_timer);
}
void rose_stop_timer(struct sock *sk)
{
- del_timer(&rose_sk(sk)->timer);
+ sk_stop_timer(sk, &rose_sk(sk)->timer);
}
void rose_stop_idletimer(struct sock *sk)
{
- del_timer(&rose_sk(sk)->idletimer);
+ sk_stop_timer(sk, &rose_sk(sk)->idletimer);
}
static void rose_heartbeat_expiry(struct timer_list *t)
@@ -130,6 +130,7 @@ static void rose_heartbeat_expiry(struct timer_list *t)
(sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) {
bh_unlock_sock(sk);
rose_destroy_socket(sk);
+ sock_put(sk);
return;
}
break;
@@ -152,6 +153,7 @@ static void rose_heartbeat_expiry(struct timer_list *t)
rose_start_heartbeat(sk);
bh_unlock_sock(sk);
+ sock_put(sk);
}
static void rose_timer_expiry(struct timer_list *t)
@@ -181,6 +183,7 @@ static void rose_timer_expiry(struct timer_list *t)
break;
}
bh_unlock_sock(sk);
+ sock_put(sk);
}
static void rose_idletimer_expiry(struct timer_list *t)
@@ -205,4 +208,5 @@ static void rose_idletimer_expiry(struct timer_list *t)
sock_set_flag(sk, SOCK_DEAD);
}
bh_unlock_sock(sk);
+ sock_put(sk);
}
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 08aab5c01437..258917a714c8 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -431,7 +431,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
break;
}
- _leave(" = %d [set %hx]", ret, y);
+ _leave(" = %d [set %x]", ret, y);
return ret;
}
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index da9733da9868..817065aa2833 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -588,7 +588,8 @@ static int tcf_idr_release_unsafe(struct tc_action *p)
}
static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
- const struct tc_action_ops *ops)
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
{
struct nlattr *nest;
int n_i = 0;
@@ -604,20 +605,25 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
if (nla_put_string(skb, TCA_KIND, ops->kind))
goto nla_put_failure;
+ ret = 0;
mutex_lock(&idrinfo->lock);
idr_for_each_entry_ul(idr, p, tmp, id) {
if (IS_ERR(p))
continue;
ret = tcf_idr_release_unsafe(p);
- if (ret == ACT_P_DELETED) {
+ if (ret == ACT_P_DELETED)
module_put(ops->owner);
- n_i++;
- } else if (ret < 0) {
- mutex_unlock(&idrinfo->lock);
- goto nla_put_failure;
- }
+ else if (ret < 0)
+ break;
+ n_i++;
}
mutex_unlock(&idrinfo->lock);
+ if (ret < 0) {
+ if (n_i)
+ NL_SET_ERR_MSG(extack, "Unable to flush all TC actions");
+ else
+ goto nla_put_failure;
+ }
ret = nla_put_u32(skb, TCA_FCNT, n_i);
if (ret)
@@ -638,7 +644,7 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
struct tcf_idrinfo *idrinfo = tn->idrinfo;
if (type == RTM_DELACTION) {
- return tcf_del_walker(idrinfo, skb, ops);
+ return tcf_del_walker(idrinfo, skb, ops, extack);
} else if (type == RTM_GETACTION) {
return tcf_dump_walker(idrinfo, skb, cb);
} else {
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 79c8901f66ab..b759628a47c2 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -442,7 +442,7 @@ static int tcf_police_act_to_flow_act(int tc_act, u32 *extval,
act_id = FLOW_ACTION_JUMP;
*extval = tc_act & TC_ACT_EXT_VAL_MASK;
} else if (tc_act == TC_ACT_UNSPEC) {
- NL_SET_ERR_MSG_MOD(extack, "Offload not supported when conform/exceed action is \"continue\"");
+ act_id = FLOW_ACTION_CONTINUE;
} else {
NL_SET_ERR_MSG_MOD(extack, "Unsupported conform/exceed action offload");
}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index ed4ccef5d6a8..5449ed114e40 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -1146,9 +1146,9 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
struct tc_netem_rate rate;
struct tc_netem_slot slot;
- qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
+ qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
UINT_MAX);
- qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
+ qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
UINT_MAX);
qopt.limit = q->limit;
qopt.loss = q->loss;
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index b9c71a304d39..0b941dd63d26 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
+#include <linux/time.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
@@ -176,7 +177,7 @@ static ktime_t get_interval_end_time(struct sched_gate_list *sched,
static int length_to_duration(struct taprio_sched *q, int len)
{
- return div_u64(len * atomic64_read(&q->picos_per_byte), 1000);
+ return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC);
}
/* Returns the entry corresponding to next available interval. If
@@ -551,7 +552,7 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
{
atomic_set(&entry->budget,
- div64_u64((u64)entry->interval * 1000,
+ div64_u64((u64)entry->interval * PSEC_PER_NSEC,
atomic64_read(&q->picos_per_byte)));
}
diff --git a/net/socket.c b/net/socket.c
index 1b6f5e2ebef5..3d7eb2a79e82 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2146,10 +2146,13 @@ SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
struct sockaddr __user *addr, int __user *addr_len)
{
+ struct sockaddr_storage address;
+ struct msghdr msg = {
+ /* Save some cycles and don't copy the address if not needed */
+ .msg_name = addr ? (struct sockaddr *)&address : NULL,
+ };
struct socket *sock;
struct iovec iov;
- struct msghdr msg;
- struct sockaddr_storage address;
int err, err2;
int fput_needed;
@@ -2160,14 +2163,6 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
if (!sock)
goto out;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- /* Save some cycles and don't copy the address if not needed */
- msg.msg_name = addr ? (struct sockaddr *)&address : NULL;
- /* We assume all kernel code knows the size of sockaddr_storage */
- msg.msg_namelen = 0;
- msg.msg_iocb = NULL;
- msg.msg_flags = 0;
if (sock->file->f_flags & O_NONBLOCK)
flags |= MSG_DONTWAIT;
err = sock_recvmsg(sock, &msg, flags);
@@ -2372,6 +2367,7 @@ int __copy_msghdr_from_user(struct msghdr *kmsg,
return -EFAULT;
kmsg->msg_control_is_user = true;
+ kmsg->msg_get_inq = 0;
kmsg->msg_control_user = msg.msg_control;
kmsg->msg_controllen = msg.msg_controllen;
kmsg->msg_flags = msg.msg_flags;
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 1a72c67afed5..8299ceb3e373 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -533,6 +533,9 @@ EXPORT_SYMBOL_GPL(strp_check_rcv);
static int __init strp_dev_init(void)
{
+ BUILD_BUG_ON(sizeof(struct sk_skb_cb) >
+ sizeof_field(struct sk_buff, cb));
+
strp_wq = create_singlethread_workqueue("kstrp");
if (unlikely(!strp_wq))
return -ENOMEM;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index e2c6eca0271b..b6781ada3aa8 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -651,6 +651,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args,
new->cl_discrtry = clnt->cl_discrtry;
new->cl_chatty = clnt->cl_chatty;
new->cl_principal = clnt->cl_principal;
+ new->cl_max_connect = clnt->cl_max_connect;
return new;
out_err:
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index f87a2d8f23a7..5d2b3e6979fb 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -984,7 +984,7 @@ static noinline __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr,
p = page_address(*xdr->page_ptr);
xdr->p = p + frag2bytes;
space_left = xdr->buf->buflen - xdr->buf->len;
- if (space_left - nbytes >= PAGE_SIZE)
+ if (space_left - frag1bytes >= PAGE_SIZE)
xdr->end = p + PAGE_SIZE;
else
xdr->end = p + space_left - frag1bytes;
diff --git a/net/tipc/core.c b/net/tipc/core.c
index 3f4542e0f065..434e70eabe08 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -109,10 +109,9 @@ static void __net_exit tipc_exit_net(struct net *net)
struct tipc_net *tn = tipc_net(net);
tipc_detach_loopback(net);
+ tipc_net_stop(net);
/* Make sure the tipc_net_finalize_work() finished */
cancel_work_sync(&tn->work);
- tipc_net_stop(net);
-
tipc_bcast_stop(net);
tipc_nametbl_stop(net);
tipc_sk_rht_destroy(net);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 6ef95ce565bd..b48d97cbbe29 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -472,8 +472,8 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id,
bool preliminary)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_link *l, *snd_l = tipc_bc_sndlink(net);
struct tipc_node *n, *temp_node;
- struct tipc_link *l;
unsigned long intv;
int bearer_id;
int i;
@@ -488,6 +488,16 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id,
goto exit;
/* A preliminary node becomes "real" now, refresh its data */
tipc_node_write_lock(n);
+ if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX,
+ tipc_link_min_win(snd_l), tipc_link_max_win(snd_l),
+ n->capabilities, &n->bc_entry.inputq1,
+ &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) {
+ pr_warn("Broadcast rcv link refresh failed, no memory\n");
+ tipc_node_write_unlock_fast(n);
+ tipc_node_put(n);
+ n = NULL;
+ goto exit;
+ }
n->preliminary = false;
n->addr = addr;
hlist_del_rcu(&n->hash);
@@ -567,7 +577,16 @@ update:
n->signature = INVALID_NODE_SIG;
n->active_links[0] = INVALID_BEARER_ID;
n->active_links[1] = INVALID_BEARER_ID;
- n->bc_entry.link = NULL;
+ if (!preliminary &&
+ !tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX,
+ tipc_link_min_win(snd_l), tipc_link_max_win(snd_l),
+ n->capabilities, &n->bc_entry.inputq1,
+ &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) {
+ pr_warn("Broadcast rcv link creation failed, no memory\n");
+ kfree(n);
+ n = NULL;
+ goto exit;
+ }
tipc_node_get(n);
timer_setup(&n->timer, tipc_node_timeout, 0);
/* Start a slow timer anyway, crypto needs it */
@@ -1155,7 +1174,7 @@ void tipc_node_check_dest(struct net *net, u32 addr,
bool *respond, bool *dupl_addr)
{
struct tipc_node *n;
- struct tipc_link *l, *snd_l;
+ struct tipc_link *l;
struct tipc_link_entry *le;
bool addr_match = false;
bool sign_match = false;
@@ -1175,22 +1194,6 @@ void tipc_node_check_dest(struct net *net, u32 addr,
return;
tipc_node_write_lock(n);
- if (unlikely(!n->bc_entry.link)) {
- snd_l = tipc_bc_sndlink(net);
- if (!tipc_link_bc_create(net, tipc_own_addr(net),
- addr, peer_id, U16_MAX,
- tipc_link_min_win(snd_l),
- tipc_link_max_win(snd_l),
- n->capabilities,
- &n->bc_entry.inputq1,
- &n->bc_entry.namedq, snd_l,
- &n->bc_entry.link)) {
- pr_warn("Broadcast rcv link creation failed, no mem\n");
- tipc_node_write_unlock_fast(n);
- tipc_node_put(n);
- return;
- }
- }
le = &n->links[b->identity];
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 17f8c523e33b..43509c7e90fc 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -502,6 +502,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
sock_init_data(sock, sk);
tipc_set_sk_state(sk, TIPC_OPEN);
if (tipc_sk_insert(tsk)) {
+ sk_free(sk);
pr_warn("Socket create failed; port number exhausted\n");
return -EINVAL;
}
diff --git a/net/tls/tls.h b/net/tls/tls.h
new file mode 100644
index 000000000000..8005ee25157d
--- /dev/null
+++ b/net/tls/tls.h
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _TLS_INT_H
+#define _TLS_INT_H
+
+#include <asm/byteorder.h>
+#include <linux/types.h>
+#include <linux/skmsg.h>
+#include <net/tls.h>
+
+#define __TLS_INC_STATS(net, field) \
+ __SNMP_INC_STATS((net)->mib.tls_statistics, field)
+#define TLS_INC_STATS(net, field) \
+ SNMP_INC_STATS((net)->mib.tls_statistics, field)
+#define TLS_DEC_STATS(net, field) \
+ SNMP_DEC_STATS((net)->mib.tls_statistics, field)
+
+/* TLS records are maintained in 'struct tls_rec'. It stores the memory pages
+ * allocated or mapped for each TLS record. After encryption, the records are
+ * stores in a linked list.
+ */
+struct tls_rec {
+ struct list_head list;
+ int tx_ready;
+ int tx_flags;
+
+ struct sk_msg msg_plaintext;
+ struct sk_msg msg_encrypted;
+
+ /* AAD | msg_plaintext.sg.data | sg_tag */
+ struct scatterlist sg_aead_in[2];
+ /* AAD | msg_encrypted.sg.data (data contains overhead for hdr & iv & tag) */
+ struct scatterlist sg_aead_out[2];
+
+ char content_type;
+ struct scatterlist sg_content_type;
+
+ char aad_space[TLS_AAD_SPACE_SIZE];
+ u8 iv_data[MAX_IV_SIZE];
+ struct aead_request aead_req;
+ u8 aead_req_ctx[];
+};
+
+int __net_init tls_proc_init(struct net *net);
+void __net_exit tls_proc_fini(struct net *net);
+
+struct tls_context *tls_ctx_create(struct sock *sk);
+void tls_ctx_free(struct sock *sk, struct tls_context *ctx);
+void update_sk_prot(struct sock *sk, struct tls_context *ctx);
+
+int wait_on_pending_writer(struct sock *sk, long *timeo);
+int tls_sk_query(struct sock *sk, int optname, char __user *optval,
+ int __user *optlen);
+int tls_sk_attach(struct sock *sk, int optname, char __user *optval,
+ unsigned int optlen);
+void tls_err_abort(struct sock *sk, int err);
+
+int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx);
+void tls_update_rx_zc_capable(struct tls_context *tls_ctx);
+void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx);
+void tls_sw_strparser_done(struct tls_context *tls_ctx);
+int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
+int tls_sw_sendpage_locked(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags);
+int tls_sw_sendpage(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags);
+void tls_sw_cancel_work_tx(struct tls_context *tls_ctx);
+void tls_sw_release_resources_tx(struct sock *sk);
+void tls_sw_free_ctx_tx(struct tls_context *tls_ctx);
+void tls_sw_free_resources_rx(struct sock *sk);
+void tls_sw_release_resources_rx(struct sock *sk);
+void tls_sw_free_ctx_rx(struct tls_context *tls_ctx);
+int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, int *addr_len);
+bool tls_sw_sock_is_readable(struct sock *sk);
+ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags);
+
+int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
+int tls_device_sendpage(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags);
+int tls_tx_records(struct sock *sk, int flags);
+
+void tls_sw_write_space(struct sock *sk, struct tls_context *ctx);
+void tls_device_write_space(struct sock *sk, struct tls_context *ctx);
+
+int tls_process_cmsg(struct sock *sk, struct msghdr *msg,
+ unsigned char *record_type);
+int decrypt_skb(struct sock *sk, struct sk_buff *skb,
+ struct scatterlist *sgout);
+
+int tls_sw_fallback_init(struct sock *sk,
+ struct tls_offload_context_tx *offload_ctx,
+ struct tls_crypto_info *crypto_info);
+
+static inline struct tls_msg *tls_msg(struct sk_buff *skb)
+{
+ struct sk_skb_cb *scb = (struct sk_skb_cb *)skb->cb;
+
+ return &scb->tls;
+}
+
+#ifdef CONFIG_TLS_DEVICE
+void tls_device_init(void);
+void tls_device_cleanup(void);
+int tls_set_device_offload(struct sock *sk, struct tls_context *ctx);
+void tls_device_free_resources_tx(struct sock *sk);
+int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx);
+void tls_device_offload_cleanup_rx(struct sock *sk);
+void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq);
+int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx,
+ struct sk_buff *skb, struct strp_msg *rxm);
+#else
+static inline void tls_device_init(void) {}
+static inline void tls_device_cleanup(void) {}
+
+static inline int
+tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void tls_device_free_resources_tx(struct sock *sk) {}
+
+static inline int
+tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void tls_device_offload_cleanup_rx(struct sock *sk) {}
+static inline void
+tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) {}
+
+static inline int
+tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx,
+ struct sk_buff *skb, struct strp_msg *rxm)
+{
+ return 0;
+}
+#endif
+
+int tls_push_sg(struct sock *sk, struct tls_context *ctx,
+ struct scatterlist *sg, u16 first_offset,
+ int flags);
+int tls_push_partial_record(struct sock *sk, struct tls_context *ctx,
+ int flags);
+void tls_free_partial_record(struct sock *sk, struct tls_context *ctx);
+
+static inline bool tls_is_partially_sent_record(struct tls_context *ctx)
+{
+ return !!ctx->partially_sent_record;
+}
+
+static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx)
+{
+ return tls_ctx->pending_open_record_frags;
+}
+
+static inline bool tls_bigint_increment(unsigned char *seq, int len)
+{
+ int i;
+
+ for (i = len - 1; i >= 0; i--) {
+ ++seq[i];
+ if (seq[i] != 0)
+ break;
+ }
+
+ return (i == -1);
+}
+
+static inline void tls_bigint_subtract(unsigned char *seq, int n)
+{
+ u64 rcd_sn;
+ __be64 *p;
+
+ BUILD_BUG_ON(TLS_MAX_REC_SEQ_SIZE != 8);
+
+ p = (__be64 *)seq;
+ rcd_sn = be64_to_cpu(*p);
+ *p = cpu_to_be64(rcd_sn - n);
+}
+
+static inline void
+tls_advance_record_sn(struct sock *sk, struct tls_prot_info *prot,
+ struct cipher_context *ctx)
+{
+ if (tls_bigint_increment(ctx->rec_seq, prot->rec_seq_size))
+ tls_err_abort(sk, -EBADMSG);
+
+ if (prot->version != TLS_1_3_VERSION &&
+ prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305)
+ tls_bigint_increment(ctx->iv + prot->salt_size,
+ prot->iv_size);
+}
+
+static inline void
+tls_xor_iv_with_seq(struct tls_prot_info *prot, char *iv, char *seq)
+{
+ int i;
+
+ if (prot->version == TLS_1_3_VERSION ||
+ prot->cipher_type == TLS_CIPHER_CHACHA20_POLY1305) {
+ for (i = 0; i < 8; i++)
+ iv[i + 4] ^= seq[i];
+ }
+}
+
+static inline void
+tls_fill_prepend(struct tls_context *ctx, char *buf, size_t plaintext_len,
+ unsigned char record_type)
+{
+ struct tls_prot_info *prot = &ctx->prot_info;
+ size_t pkt_len, iv_size = prot->iv_size;
+
+ pkt_len = plaintext_len + prot->tag_size;
+ if (prot->version != TLS_1_3_VERSION &&
+ prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305) {
+ pkt_len += iv_size;
+
+ memcpy(buf + TLS_NONCE_OFFSET,
+ ctx->tx.iv + prot->salt_size, iv_size);
+ }
+
+ /* we cover nonce explicit here as well, so buf should be of
+ * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE
+ */
+ buf[0] = prot->version == TLS_1_3_VERSION ?
+ TLS_RECORD_TYPE_DATA : record_type;
+ /* Note that VERSION must be TLS_1_2 for both TLS1.2 and TLS1.3 */
+ buf[1] = TLS_1_2_VERSION_MINOR;
+ buf[2] = TLS_1_2_VERSION_MAJOR;
+ /* we can use IV for nonce explicit according to spec */
+ buf[3] = pkt_len >> 8;
+ buf[4] = pkt_len & 0xFF;
+}
+
+static inline
+void tls_make_aad(char *buf, size_t size, char *record_sequence,
+ unsigned char record_type, struct tls_prot_info *prot)
+{
+ if (prot->version != TLS_1_3_VERSION) {
+ memcpy(buf, record_sequence, prot->rec_seq_size);
+ buf += 8;
+ } else {
+ size += prot->tag_size;
+ }
+
+ buf[0] = prot->version == TLS_1_3_VERSION ?
+ TLS_RECORD_TYPE_DATA : record_type;
+ buf[1] = TLS_1_2_VERSION_MAJOR;
+ buf[2] = TLS_1_2_VERSION_MINOR;
+ buf[3] = size >> 8;
+ buf[4] = size & 0xFF;
+}
+
+#endif
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index ec6f4b699a2b..227b92a3064a 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -38,6 +38,7 @@
#include <net/tcp.h>
#include <net/tls.h>
+#include "tls.h"
#include "trace.h"
/* device_offload_lock is used to synchronize tls_dev_add
@@ -562,7 +563,7 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
lock_sock(sk);
if (unlikely(msg->msg_controllen)) {
- rc = tls_proccess_cmsg(sk, msg, &record_type);
+ rc = tls_process_cmsg(sk, msg, &record_type);
if (rc)
goto out;
}
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index e40bedd112b6..618cee704217 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -34,6 +34,8 @@
#include <crypto/scatterwalk.h>
#include <net/ip6_checksum.h>
+#include "tls.h"
+
static void chain_to_walk(struct scatterlist *sg, struct scatter_walk *walk)
{
struct scatterlist *src = walk->sg;
@@ -232,7 +234,7 @@ static int fill_sg_in(struct scatterlist *sg_in,
s32 *sync_size,
int *resync_sgs)
{
- int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ int tcp_payload_offset = skb_tcp_all_headers(skb);
int payload_len = skb->len - tcp_payload_offset;
u32 tcp_seq = ntohl(tcp_hdr(skb)->seq);
struct tls_record_info *record;
@@ -310,8 +312,8 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx,
struct sk_buff *skb,
s32 sync_size, u64 rcd_sn)
{
- int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
+ int tcp_payload_offset = skb_tcp_all_headers(skb);
int payload_len = skb->len - tcp_payload_offset;
void *buf, *iv, *aad, *dummy_buf;
struct aead_request *aead_req;
@@ -372,7 +374,7 @@ free_nskb:
static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb)
{
- int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ int tcp_payload_offset = skb_tcp_all_headers(skb);
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
int payload_len = skb->len - tcp_payload_offset;
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index da176411c1b5..f3d9dbfa507e 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -45,6 +45,8 @@
#include <net/tls.h>
#include <net/tls_toe.h>
+#include "tls.h"
+
MODULE_AUTHOR("Mellanox Technologies");
MODULE_DESCRIPTION("Transport Layer Security Support");
MODULE_LICENSE("Dual BSD/GPL");
@@ -164,8 +166,8 @@ static int tls_handle_open_record(struct sock *sk, int flags)
return 0;
}
-int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
- unsigned char *record_type)
+int tls_process_cmsg(struct sock *sk, struct msghdr *msg,
+ unsigned char *record_type)
{
struct cmsghdr *cmsg;
int rc = -EINVAL;
@@ -533,6 +535,37 @@ static int do_tls_getsockopt_tx_zc(struct sock *sk, char __user *optval,
return 0;
}
+static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval,
+ int __user *optlen)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ unsigned int value;
+ int err, len;
+
+ if (ctx->prot_info.version != TLS_1_3_VERSION)
+ return -EINVAL;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < sizeof(value))
+ return -EINVAL;
+
+ lock_sock(sk);
+ err = -EINVAL;
+ if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW)
+ value = ctx->rx_no_pad;
+ release_sock(sk);
+ if (err)
+ return err;
+
+ if (put_user(sizeof(value), optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &value, sizeof(value)))
+ return -EFAULT;
+
+ return 0;
+}
+
static int do_tls_getsockopt(struct sock *sk, int optname,
char __user *optval, int __user *optlen)
{
@@ -547,6 +580,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname,
case TLS_TX_ZEROCOPY_RO:
rc = do_tls_getsockopt_tx_zc(sk, optval, optlen);
break;
+ case TLS_RX_EXPECT_NO_PAD:
+ rc = do_tls_getsockopt_no_pad(sk, optval, optlen);
+ break;
default:
rc = -ENOPROTOOPT;
break;
@@ -718,6 +754,38 @@ static int do_tls_setsockopt_tx_zc(struct sock *sk, sockptr_t optval,
return 0;
}
+static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ u32 val;
+ int rc;
+
+ if (ctx->prot_info.version != TLS_1_3_VERSION ||
+ sockptr_is_null(optval) || optlen < sizeof(val))
+ return -EINVAL;
+
+ rc = copy_from_sockptr(&val, optval, sizeof(val));
+ if (rc)
+ return -EFAULT;
+ if (val > 1)
+ return -EINVAL;
+ rc = check_zeroed_sockptr(optval, sizeof(val), optlen - sizeof(val));
+ if (rc < 1)
+ return rc == 0 ? -EINVAL : rc;
+
+ lock_sock(sk);
+ rc = -EINVAL;
+ if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW) {
+ ctx->rx_no_pad = val;
+ tls_update_rx_zc_capable(ctx);
+ rc = 0;
+ }
+ release_sock(sk);
+
+ return rc;
+}
+
static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
unsigned int optlen)
{
@@ -736,6 +804,9 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
rc = do_tls_setsockopt_tx_zc(sk, optval, optlen);
release_sock(sk);
break;
+ case TLS_RX_EXPECT_NO_PAD:
+ rc = do_tls_setsockopt_no_pad(sk, optval, optlen);
+ break;
default:
rc = -ENOPROTOOPT;
break;
@@ -921,6 +992,8 @@ static void tls_update(struct sock *sk, struct proto *p,
{
struct tls_context *ctx;
+ WARN_ON_ONCE(sk->sk_prot == p);
+
ctx = tls_get_ctx(sk);
if (likely(ctx)) {
ctx->sk_write_space = write_space;
@@ -932,6 +1005,23 @@ static void tls_update(struct sock *sk, struct proto *p,
}
}
+static u16 tls_user_config(struct tls_context *ctx, bool tx)
+{
+ u16 config = tx ? ctx->tx_conf : ctx->rx_conf;
+
+ switch (config) {
+ case TLS_BASE:
+ return TLS_CONF_BASE;
+ case TLS_SW:
+ return TLS_CONF_SW;
+ case TLS_HW:
+ return TLS_CONF_HW;
+ case TLS_HW_RECORD:
+ return TLS_CONF_HW_RECORD;
+ }
+ return 0;
+}
+
static int tls_get_info(const struct sock *sk, struct sk_buff *skb)
{
u16 version, cipher_type;
@@ -974,6 +1064,11 @@ static int tls_get_info(const struct sock *sk, struct sk_buff *skb)
if (err)
goto nla_failure;
}
+ if (ctx->rx_no_pad) {
+ err = nla_put_flag(skb, TLS_INFO_RX_NO_PAD);
+ if (err)
+ goto nla_failure;
+ }
rcu_read_unlock();
nla_nest_end(skb, start);
@@ -995,6 +1090,7 @@ static size_t tls_get_info_size(const struct sock *sk)
nla_total_size(sizeof(u16)) + /* TLS_INFO_RXCONF */
nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */
nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */
+ nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */
0;
return size;
diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c
index feeceb0e4cb4..1246e52b48f6 100644
--- a/net/tls/tls_proc.c
+++ b/net/tls/tls_proc.c
@@ -6,6 +6,8 @@
#include <net/snmp.h>
#include <net/tls.h>
+#include "tls.h"
+
#ifdef CONFIG_PROC_FS
static const struct snmp_mib tls_mib_list[] = {
SNMP_MIB_ITEM("TlsCurrTxSw", LINUX_MIB_TLSCURRTXSW),
@@ -18,6 +20,7 @@ static const struct snmp_mib tls_mib_list[] = {
SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE),
SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR),
SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC),
+ SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIN_TLSDECRYPTRETRY),
SNMP_MIB_SENTINEL
};
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 0513f82b8537..09370f853031 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -44,9 +44,19 @@
#include <net/strparser.h>
#include <net/tls.h>
+#include "tls.h"
+
struct tls_decrypt_arg {
bool zc;
bool async;
+ u8 tail;
+};
+
+struct tls_decrypt_ctx {
+ u8 iv[MAX_IV_SIZE];
+ u8 aad[TLS_MAX_AAD_SIZE];
+ u8 tail;
+ struct scatterlist sg[];
};
noinline void tls_err_abort(struct sock *sk, int err)
@@ -133,7 +143,8 @@ static int skb_nsg(struct sk_buff *skb, int offset, int len)
return __skb_nsg(skb, offset, len, 0);
}
-static int padding_length(struct tls_prot_info *prot, struct sk_buff *skb)
+static int tls_padding_length(struct tls_prot_info *prot, struct sk_buff *skb,
+ struct tls_decrypt_arg *darg)
{
struct strp_msg *rxm = strp_msg(skb);
struct tls_msg *tlm = tls_msg(skb);
@@ -142,7 +153,7 @@ static int padding_length(struct tls_prot_info *prot, struct sk_buff *skb)
/* Determine zero-padding length */
if (prot->version == TLS_1_3_VERSION) {
int offset = rxm->full_len - TLS_TAG_SIZE - 1;
- char content_type = 0;
+ char content_type = darg->zc ? darg->tail : 0;
int err;
while (content_type == 0) {
@@ -267,9 +278,6 @@ static int tls_do_decryption(struct sock *sk,
}
darg->async = false;
- if (ret == -EBADMSG)
- TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR);
-
return ret;
}
@@ -518,7 +526,8 @@ static int tls_do_encryption(struct sock *sk,
memcpy(&rec->iv_data[iv_offset], tls_ctx->tx.iv,
prot->iv_size + prot->salt_size);
- xor_iv_with_seq(prot, rec->iv_data + iv_offset, tls_ctx->tx.rec_seq);
+ tls_xor_iv_with_seq(prot, rec->iv_data + iv_offset,
+ tls_ctx->tx.rec_seq);
sge->offset += prot->prepend_size;
sge->length -= prot->prepend_size;
@@ -955,7 +964,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
lock_sock(sk);
if (unlikely(msg->msg_controllen)) {
- ret = tls_proccess_cmsg(sk, msg, &record_type);
+ ret = tls_process_cmsg(sk, msg, &record_type);
if (ret) {
if (ret == -EINPROGRESS)
num_async++;
@@ -1293,54 +1302,50 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
return ret;
}
-static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock,
- bool nonblock, long timeo, int *err)
+static int
+tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
+ long timeo)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
- struct sk_buff *skb;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
- while (!(skb = ctx->recv_pkt) && sk_psock_queue_empty(psock)) {
- if (sk->sk_err) {
- *err = sock_error(sk);
- return NULL;
- }
+ while (!ctx->recv_pkt) {
+ if (!sk_psock_queue_empty(psock))
+ return 0;
+
+ if (sk->sk_err)
+ return sock_error(sk);
if (!skb_queue_empty(&sk->sk_receive_queue)) {
__strp_unpause(&ctx->strp);
if (ctx->recv_pkt)
- return ctx->recv_pkt;
+ break;
}
if (sk->sk_shutdown & RCV_SHUTDOWN)
- return NULL;
+ return 0;
if (sock_flag(sk, SOCK_DONE))
- return NULL;
+ return 0;
- if (nonblock || !timeo) {
- *err = -EAGAIN;
- return NULL;
- }
+ if (nonblock || !timeo)
+ return -EAGAIN;
add_wait_queue(sk_sleep(sk), &wait);
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
sk_wait_event(sk, &timeo,
- ctx->recv_pkt != skb ||
- !sk_psock_queue_empty(psock),
+ ctx->recv_pkt || !sk_psock_queue_empty(psock),
&wait);
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
remove_wait_queue(sk_sleep(sk), &wait);
/* Handle signals */
- if (signal_pending(current)) {
- *err = sock_intr_errno(timeo);
- return NULL;
- }
+ if (signal_pending(current))
+ return sock_intr_errno(timeo);
}
- return skb;
+ return 1;
}
static int tls_setup_from_iter(struct iov_iter *from,
@@ -1415,21 +1420,22 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct tls_prot_info *prot = &tls_ctx->prot_info;
+ int n_sgin, n_sgout, aead_size, err, pages = 0;
struct strp_msg *rxm = strp_msg(skb);
struct tls_msg *tlm = tls_msg(skb);
- int n_sgin, n_sgout, nsg, mem_size, aead_size, err, pages = 0;
struct aead_request *aead_req;
struct sk_buff *unused;
- u8 *aad, *iv, *mem = NULL;
struct scatterlist *sgin = NULL;
struct scatterlist *sgout = NULL;
- const int data_len = rxm->full_len - prot->overhead_size +
- prot->tail_size;
+ const int data_len = rxm->full_len - prot->overhead_size;
+ int tail_pages = !!prot->tail_size;
+ struct tls_decrypt_ctx *dctx;
int iv_offset = 0;
+ u8 *mem;
if (darg->zc && (out_iov || out_sg)) {
if (out_iov)
- n_sgout = 1 +
+ n_sgout = 1 + tail_pages +
iov_iter_npages_cap(out_iov, INT_MAX, data_len);
else
n_sgout = sg_nents(out_sg);
@@ -1447,36 +1453,30 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
/* Increment to accommodate AAD */
n_sgin = n_sgin + 1;
- nsg = n_sgin + n_sgout;
-
- aead_size = sizeof(*aead_req) + crypto_aead_reqsize(ctx->aead_recv);
- mem_size = aead_size + (nsg * sizeof(struct scatterlist));
- mem_size = mem_size + prot->aad_size;
- mem_size = mem_size + MAX_IV_SIZE;
-
/* Allocate a single block of memory which contains
- * aead_req || sgin[] || sgout[] || aad || iv.
- * This order achieves correct alignment for aead_req, sgin, sgout.
+ * aead_req || tls_decrypt_ctx.
+ * Both structs are variable length.
*/
- mem = kmalloc(mem_size, sk->sk_allocation);
+ aead_size = sizeof(*aead_req) + crypto_aead_reqsize(ctx->aead_recv);
+ mem = kmalloc(aead_size + struct_size(dctx, sg, n_sgin + n_sgout),
+ sk->sk_allocation);
if (!mem)
return -ENOMEM;
/* Segment the allocated memory */
aead_req = (struct aead_request *)mem;
- sgin = (struct scatterlist *)(mem + aead_size);
- sgout = sgin + n_sgin;
- aad = (u8 *)(sgout + n_sgout);
- iv = aad + prot->aad_size;
+ dctx = (struct tls_decrypt_ctx *)(mem + aead_size);
+ sgin = &dctx->sg[0];
+ sgout = &dctx->sg[n_sgin];
/* For CCM based ciphers, first byte of nonce+iv is a constant */
switch (prot->cipher_type) {
case TLS_CIPHER_AES_CCM_128:
- iv[0] = TLS_AES_CCM_IV_B0_BYTE;
+ dctx->iv[0] = TLS_AES_CCM_IV_B0_BYTE;
iv_offset = 1;
break;
case TLS_CIPHER_SM4_CCM:
- iv[0] = TLS_SM4_CCM_IV_B0_BYTE;
+ dctx->iv[0] = TLS_SM4_CCM_IV_B0_BYTE;
iv_offset = 1;
break;
}
@@ -1484,46 +1484,49 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
/* Prepare IV */
if (prot->version == TLS_1_3_VERSION ||
prot->cipher_type == TLS_CIPHER_CHACHA20_POLY1305) {
- memcpy(iv + iv_offset, tls_ctx->rx.iv,
+ memcpy(&dctx->iv[iv_offset], tls_ctx->rx.iv,
prot->iv_size + prot->salt_size);
} else {
err = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE,
- iv + iv_offset + prot->salt_size,
+ &dctx->iv[iv_offset] + prot->salt_size,
prot->iv_size);
- if (err < 0) {
- kfree(mem);
- return err;
- }
- memcpy(iv + iv_offset, tls_ctx->rx.iv, prot->salt_size);
+ if (err < 0)
+ goto exit_free;
+ memcpy(&dctx->iv[iv_offset], tls_ctx->rx.iv, prot->salt_size);
}
- xor_iv_with_seq(prot, iv + iv_offset, tls_ctx->rx.rec_seq);
+ tls_xor_iv_with_seq(prot, &dctx->iv[iv_offset], tls_ctx->rx.rec_seq);
/* Prepare AAD */
- tls_make_aad(aad, rxm->full_len - prot->overhead_size +
+ tls_make_aad(dctx->aad, rxm->full_len - prot->overhead_size +
prot->tail_size,
tls_ctx->rx.rec_seq, tlm->control, prot);
/* Prepare sgin */
sg_init_table(sgin, n_sgin);
- sg_set_buf(&sgin[0], aad, prot->aad_size);
+ sg_set_buf(&sgin[0], dctx->aad, prot->aad_size);
err = skb_to_sgvec(skb, &sgin[1],
rxm->offset + prot->prepend_size,
rxm->full_len - prot->prepend_size);
- if (err < 0) {
- kfree(mem);
- return err;
- }
+ if (err < 0)
+ goto exit_free;
if (n_sgout) {
if (out_iov) {
sg_init_table(sgout, n_sgout);
- sg_set_buf(&sgout[0], aad, prot->aad_size);
+ sg_set_buf(&sgout[0], dctx->aad, prot->aad_size);
err = tls_setup_from_iter(out_iov, data_len,
&pages, &sgout[1],
- (n_sgout - 1));
+ (n_sgout - 1 - tail_pages));
if (err < 0)
goto fallback_to_reg_recv;
+
+ if (prot->tail_size) {
+ sg_unmark_end(&sgout[pages]);
+ sg_set_buf(&sgout[pages + 1], &dctx->tail,
+ prot->tail_size);
+ sg_mark_end(&sgout[pages + 1]);
+ }
} else if (out_sg) {
memcpy(sgout, out_sg, n_sgout * sizeof(*sgout));
} else {
@@ -1537,15 +1540,18 @@ fallback_to_reg_recv:
}
/* Prepare and submit AEAD request */
- err = tls_do_decryption(sk, skb, sgin, sgout, iv,
- data_len, aead_req, darg);
+ err = tls_do_decryption(sk, skb, sgin, sgout, dctx->iv,
+ data_len + prot->tail_size, aead_req, darg);
if (darg->async)
return 0;
+ if (prot->tail_size)
+ darg->tail = dctx->tail;
+
/* Release the pages in case iov was mapped to pages */
for (; pages > 0; pages--)
put_page(sg_page(&sgout[pages]));
-
+exit_free:
kfree(mem);
return err;
}
@@ -1579,13 +1585,23 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
}
err = decrypt_internal(sk, skb, dest, NULL, darg);
- if (err < 0)
+ if (err < 0) {
+ if (err == -EBADMSG)
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR);
return err;
+ }
if (darg->async)
goto decrypt_next;
+ /* If opportunistic TLS 1.3 ZC failed retry without ZC */
+ if (unlikely(darg->zc && prot->version == TLS_1_3_VERSION &&
+ darg->tail != TLS_RECORD_TYPE_DATA)) {
+ darg->zc = false;
+ TLS_INC_STATS(sock_net(sk), LINUX_MIN_TLSDECRYPTRETRY);
+ return decrypt_skb_update(sk, skb, dest, darg);
+ }
decrypt_done:
- pad = padding_length(prot, skb);
+ pad = tls_padding_length(prot, skb, darg);
if (pad < 0)
return pad;
@@ -1717,6 +1733,24 @@ out:
return copied ? : err;
}
+static void
+tls_read_flush_backlog(struct sock *sk, struct tls_prot_info *prot,
+ size_t len_left, size_t decrypted, ssize_t done,
+ size_t *flushed_at)
+{
+ size_t max_rec;
+
+ if (len_left <= decrypted)
+ return;
+
+ max_rec = prot->overhead_size - prot->tail_size + TLS_MAX_PAYLOAD_SIZE;
+ if (done - *flushed_at < SZ_128K && tcp_inq(sk) > max_rec)
+ return;
+
+ *flushed_at = done;
+ sk_flush_backlog(sk);
+}
+
int tls_sw_recvmsg(struct sock *sk,
struct msghdr *msg,
size_t len,
@@ -1729,6 +1763,7 @@ int tls_sw_recvmsg(struct sock *sk,
struct sk_psock *psock;
unsigned char control = 0;
ssize_t decrypted = 0;
+ size_t flushed_at = 0;
struct strp_msg *rxm;
struct tls_msg *tlm;
struct sk_buff *skb;
@@ -1767,14 +1802,14 @@ int tls_sw_recvmsg(struct sock *sk,
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
zc_capable = !bpf_strp_enabled && !is_kvec && !is_peek &&
- prot->version != TLS_1_3_VERSION;
+ ctx->zc_capable;
decrypted = 0;
while (len && (decrypted + copied < target || ctx->recv_pkt)) {
struct tls_decrypt_arg darg = {};
int to_decrypt, chunk;
- skb = tls_wait_data(sk, psock, flags & MSG_DONTWAIT, timeo, &err);
- if (!skb) {
+ err = tls_rx_rec_wait(sk, psock, flags & MSG_DONTWAIT, timeo);
+ if (err <= 0) {
if (psock) {
chunk = sk_msg_recvmsg(sk, psock, msg, len,
flags);
@@ -1784,6 +1819,7 @@ int tls_sw_recvmsg(struct sock *sk,
goto recv_end;
}
+ skb = ctx->recv_pkt;
rxm = strp_msg(skb);
tlm = tls_msg(skb);
@@ -1818,6 +1854,10 @@ int tls_sw_recvmsg(struct sock *sk,
if (err <= 0)
goto recv_end;
+ /* periodically flush backlog, and feed strparser */
+ tls_read_flush_backlog(sk, prot, len, to_decrypt,
+ decrypted + copied, &flushed_at);
+
ctx->recv_pkt = NULL;
__strp_unpause(&ctx->strp);
__skb_queue_tail(&ctx->rx_list, skb);
@@ -1946,11 +1986,13 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
} else {
struct tls_decrypt_arg darg = {};
- skb = tls_wait_data(sk, NULL, flags & SPLICE_F_NONBLOCK, timeo,
- &err);
- if (!skb)
+ err = tls_rx_rec_wait(sk, NULL, flags & SPLICE_F_NONBLOCK,
+ timeo);
+ if (err <= 0)
goto splice_read_end;
+ skb = ctx->recv_pkt;
+
err = decrypt_skb_update(sk, skb, NULL, &darg);
if (err < 0) {
tls_err_abort(sk, -EBADMSG);
@@ -2227,12 +2269,23 @@ static void tx_work_handler(struct work_struct *work)
mutex_unlock(&tls_ctx->tx_lock);
}
+static bool tls_is_tx_ready(struct tls_sw_context_tx *ctx)
+{
+ struct tls_rec *rec;
+
+ rec = list_first_entry(&ctx->tx_list, struct tls_rec, list);
+ if (!rec)
+ return false;
+
+ return READ_ONCE(rec->tx_ready);
+}
+
void tls_sw_write_space(struct sock *sk, struct tls_context *ctx)
{
struct tls_sw_context_tx *tx_ctx = tls_sw_ctx_tx(ctx);
/* Schedule the transmission if tx list is ready */
- if (is_tx_ready(tx_ctx) &&
+ if (tls_is_tx_ready(tx_ctx) &&
!test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask))
schedule_delayed_work(&tx_ctx->tx_work.work, 0);
}
@@ -2249,6 +2302,14 @@ void tls_sw_strparser_arm(struct sock *sk, struct tls_context *tls_ctx)
strp_check_rcv(&rx_ctx->strp);
}
+void tls_update_rx_zc_capable(struct tls_context *tls_ctx)
+{
+ struct tls_sw_context_rx *rx_ctx = tls_sw_ctx_rx(tls_ctx);
+
+ rx_ctx->zc_capable = tls_ctx->rx_no_pad ||
+ tls_ctx->prot_info.version != TLS_1_3_VERSION;
+}
+
int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
@@ -2422,13 +2483,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
goto free_priv;
}
- /* Sanity-check the sizes for stack allocations. */
- if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE ||
- rec_seq_size > TLS_MAX_REC_SEQ_SIZE || tag_size != TLS_TAG_SIZE) {
- rc = -EINVAL;
- goto free_priv;
- }
-
if (crypto_info->version == TLS_1_3_VERSION) {
nonce_size = 0;
prot->aad_size = TLS_HEADER_SIZE;
@@ -2438,6 +2492,14 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
prot->tail_size = 0;
}
+ /* Sanity-check the sizes for stack allocations. */
+ if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE ||
+ rec_seq_size > TLS_MAX_REC_SEQ_SIZE || tag_size != TLS_TAG_SIZE ||
+ prot->aad_size > TLS_MAX_AAD_SIZE) {
+ rc = -EINVAL;
+ goto free_priv;
+ }
+
prot->version = crypto_info->version;
prot->cipher_type = crypto_info->cipher_type;
prot->prepend_size = TLS_HEADER_SIZE + nonce_size;
@@ -2484,12 +2546,10 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
if (sw_ctx_rx) {
tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv);
- if (crypto_info->version == TLS_1_3_VERSION)
- sw_ctx_rx->async_capable = 0;
- else
- sw_ctx_rx->async_capable =
- !!(tfm->__crt_alg->cra_flags &
- CRYPTO_ALG_ASYNC);
+ tls_update_rx_zc_capable(ctx);
+ sw_ctx_rx->async_capable =
+ crypto_info->version != TLS_1_3_VERSION &&
+ !!(tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC);
/* Set up strparser */
memset(&cb, 0, sizeof(cb));
diff --git a/net/tls/tls_toe.c b/net/tls/tls_toe.c
index 7e1330f19165..825669e1ab47 100644
--- a/net/tls/tls_toe.c
+++ b/net/tls/tls_toe.c
@@ -38,6 +38,8 @@
#include <net/tls.h>
#include <net/tls_toe.h>
+#include "tls.h"
+
static LIST_HEAD(device_list);
static DEFINE_SPINLOCK(device_spinlock);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 1bed3739768c..bf338b782fc4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -118,15 +118,13 @@
#include "scm.h"
-spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE];
-EXPORT_SYMBOL_GPL(unix_table_locks);
-struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
-EXPORT_SYMBOL_GPL(unix_socket_table);
static atomic_long_t unix_nr_socks;
+static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
+static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
/* SMP locking strategy:
- * hash table is protected with spinlock unix_table_locks
- * each socket state is protected by separate spin lock.
+ * hash table is protected with spinlock.
+ * each socket state is protected by separate spinlock.
*/
static unsigned int unix_unbound_hash(struct sock *sk)
@@ -137,12 +135,12 @@ static unsigned int unix_unbound_hash(struct sock *sk)
hash ^= hash >> 8;
hash ^= sk->sk_type;
- return UNIX_HASH_SIZE + (hash & (UNIX_HASH_SIZE - 1));
+ return hash & UNIX_HASH_MOD;
}
static unsigned int unix_bsd_hash(struct inode *i)
{
- return i->i_ino & (UNIX_HASH_SIZE - 1);
+ return i->i_ino & UNIX_HASH_MOD;
}
static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
@@ -155,26 +153,34 @@ static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
hash ^= hash >> 8;
hash ^= type;
- return hash & (UNIX_HASH_SIZE - 1);
+ return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}
-static void unix_table_double_lock(unsigned int hash1, unsigned int hash2)
+static void unix_table_double_lock(struct net *net,
+ unsigned int hash1, unsigned int hash2)
{
- /* hash1 and hash2 is never the same because
- * one is between 0 and UNIX_HASH_SIZE - 1, and
- * another is between UNIX_HASH_SIZE and UNIX_HASH_SIZE * 2.
- */
+ if (hash1 == hash2) {
+ spin_lock(&net->unx.table.locks[hash1]);
+ return;
+ }
+
if (hash1 > hash2)
swap(hash1, hash2);
- spin_lock(&unix_table_locks[hash1]);
- spin_lock_nested(&unix_table_locks[hash2], SINGLE_DEPTH_NESTING);
+ spin_lock(&net->unx.table.locks[hash1]);
+ spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
}
-static void unix_table_double_unlock(unsigned int hash1, unsigned int hash2)
+static void unix_table_double_unlock(struct net *net,
+ unsigned int hash1, unsigned int hash2)
{
- spin_unlock(&unix_table_locks[hash1]);
- spin_unlock(&unix_table_locks[hash2]);
+ if (hash1 == hash2) {
+ spin_unlock(&net->unx.table.locks[hash1]);
+ return;
+ }
+
+ spin_unlock(&net->unx.table.locks[hash1]);
+ spin_unlock(&net->unx.table.locks[hash2]);
}
#ifdef CONFIG_SECURITY_NETWORK
@@ -300,34 +306,52 @@ static void __unix_remove_socket(struct sock *sk)
sk_del_node_init(sk);
}
-static void __unix_insert_socket(struct sock *sk)
+static void __unix_insert_socket(struct net *net, struct sock *sk)
{
DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
- sk_add_node(sk, &unix_socket_table[sk->sk_hash]);
+ sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}
-static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr,
- unsigned int hash)
+static void __unix_set_addr_hash(struct net *net, struct sock *sk,
+ struct unix_address *addr, unsigned int hash)
{
__unix_remove_socket(sk);
smp_store_release(&unix_sk(sk)->addr, addr);
sk->sk_hash = hash;
- __unix_insert_socket(sk);
+ __unix_insert_socket(net, sk);
}
-static void unix_remove_socket(struct sock *sk)
+static void unix_remove_socket(struct net *net, struct sock *sk)
{
- spin_lock(&unix_table_locks[sk->sk_hash]);
+ spin_lock(&net->unx.table.locks[sk->sk_hash]);
__unix_remove_socket(sk);
- spin_unlock(&unix_table_locks[sk->sk_hash]);
+ spin_unlock(&net->unx.table.locks[sk->sk_hash]);
+}
+
+static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
+{
+ spin_lock(&net->unx.table.locks[sk->sk_hash]);
+ __unix_insert_socket(net, sk);
+ spin_unlock(&net->unx.table.locks[sk->sk_hash]);
+}
+
+static void unix_insert_bsd_socket(struct sock *sk)
+{
+ spin_lock(&bsd_socket_locks[sk->sk_hash]);
+ sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
+ spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}
-static void unix_insert_unbound_socket(struct sock *sk)
+static void unix_remove_bsd_socket(struct sock *sk)
{
- spin_lock(&unix_table_locks[sk->sk_hash]);
- __unix_insert_socket(sk);
- spin_unlock(&unix_table_locks[sk->sk_hash]);
+ if (!hlist_unhashed(&sk->sk_bind_node)) {
+ spin_lock(&bsd_socket_locks[sk->sk_hash]);
+ __sk_del_bind_node(sk);
+ spin_unlock(&bsd_socket_locks[sk->sk_hash]);
+
+ sk_node_init(&sk->sk_bind_node);
+ }
}
static struct sock *__unix_find_socket_byname(struct net *net,
@@ -336,12 +360,9 @@ static struct sock *__unix_find_socket_byname(struct net *net,
{
struct sock *s;
- sk_for_each(s, &unix_socket_table[hash]) {
+ sk_for_each(s, &net->unx.table.buckets[hash]) {
struct unix_sock *u = unix_sk(s);
- if (!net_eq(sock_net(s), net))
- continue;
-
if (u->addr->len == len &&
!memcmp(u->addr->name, sunname, len))
return s;
@@ -355,11 +376,11 @@ static inline struct sock *unix_find_socket_byname(struct net *net,
{
struct sock *s;
- spin_lock(&unix_table_locks[hash]);
+ spin_lock(&net->unx.table.locks[hash]);
s = __unix_find_socket_byname(net, sunname, len, hash);
if (s)
sock_hold(s);
- spin_unlock(&unix_table_locks[hash]);
+ spin_unlock(&net->unx.table.locks[hash]);
return s;
}
@@ -368,17 +389,17 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
unsigned int hash = unix_bsd_hash(i);
struct sock *s;
- spin_lock(&unix_table_locks[hash]);
- sk_for_each(s, &unix_socket_table[hash]) {
+ spin_lock(&bsd_socket_locks[hash]);
+ sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
struct dentry *dentry = unix_sk(s)->path.dentry;
if (dentry && d_backing_inode(dentry) == i) {
sock_hold(s);
- spin_unlock(&unix_table_locks[hash]);
+ spin_unlock(&bsd_socket_locks[hash]);
return s;
}
}
- spin_unlock(&unix_table_locks[hash]);
+ spin_unlock(&bsd_socket_locks[hash]);
return NULL;
}
@@ -576,12 +597,13 @@ static void unix_sock_destructor(struct sock *sk)
static void unix_release_sock(struct sock *sk, int embrion)
{
struct unix_sock *u = unix_sk(sk);
- struct path path;
struct sock *skpair;
struct sk_buff *skb;
+ struct path path;
int state;
- unix_remove_socket(sk);
+ unix_remove_socket(sock_net(sk), sk);
+ unix_remove_bsd_socket(sk);
/* Clear state */
unix_state_lock(sk);
@@ -928,9 +950,9 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
init_waitqueue_head(&u->peer_wait);
init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
memset(&u->scm_stat, 0, sizeof(struct scm_stat));
- unix_insert_unbound_socket(sk);
+ unix_insert_unbound_socket(net, sk);
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ sock_prot_inuse_add(net, sk->sk_prot, 1);
return sk;
@@ -991,8 +1013,8 @@ static int unix_release(struct socket *sock)
return 0;
}
-static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr,
- int addr_len, int type)
+static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
+ int type)
{
struct inode *inode;
struct path path;
@@ -1061,7 +1083,7 @@ static struct sock *unix_find_other(struct net *net,
struct sock *sk;
if (sunaddr->sun_path[0])
- sk = unix_find_bsd(net, sunaddr, addr_len, type);
+ sk = unix_find_bsd(sunaddr, addr_len, type);
else
sk = unix_find_abstract(net, sunaddr, addr_len, type);
@@ -1072,6 +1094,7 @@ static int unix_autobind(struct sock *sk)
{
unsigned int new_hash, old_hash = sk->sk_hash;
struct unix_sock *u = unix_sk(sk);
+ struct net *net = sock_net(sk);
struct unix_address *addr;
u32 lastnum, ordernum;
int err;
@@ -1100,11 +1123,10 @@ retry:
sprintf(addr->name->sun_path + 1, "%05x", ordernum);
new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
- unix_table_double_lock(old_hash, new_hash);
+ unix_table_double_lock(net, old_hash, new_hash);
- if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
- new_hash)) {
- unix_table_double_unlock(old_hash, new_hash);
+ if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
+ unix_table_double_unlock(net, old_hash, new_hash);
/* __unix_find_socket_byname() may take long time if many names
* are already in use.
@@ -1121,8 +1143,8 @@ retry:
goto retry;
}
- __unix_set_addr_hash(sk, addr, new_hash);
- unix_table_double_unlock(old_hash, new_hash);
+ __unix_set_addr_hash(net, sk, addr, new_hash);
+ unix_table_double_unlock(net, old_hash, new_hash);
err = 0;
out: mutex_unlock(&u->bindlock);
@@ -1136,6 +1158,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
(SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
unsigned int new_hash, old_hash = sk->sk_hash;
struct unix_sock *u = unix_sk(sk);
+ struct net *net = sock_net(sk);
struct user_namespace *ns; // barf...
struct unix_address *addr;
struct dentry *dentry;
@@ -1176,11 +1199,12 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
goto out_unlock;
new_hash = unix_bsd_hash(d_backing_inode(dentry));
- unix_table_double_lock(old_hash, new_hash);
+ unix_table_double_lock(net, old_hash, new_hash);
u->path.mnt = mntget(parent.mnt);
u->path.dentry = dget(dentry);
- __unix_set_addr_hash(sk, addr, new_hash);
- unix_table_double_unlock(old_hash, new_hash);
+ __unix_set_addr_hash(net, sk, addr, new_hash);
+ unix_table_double_unlock(net, old_hash, new_hash);
+ unix_insert_bsd_socket(sk);
mutex_unlock(&u->bindlock);
done_path_create(&parent, dentry);
return 0;
@@ -1203,6 +1227,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
{
unsigned int new_hash, old_hash = sk->sk_hash;
struct unix_sock *u = unix_sk(sk);
+ struct net *net = sock_net(sk);
struct unix_address *addr;
int err;
@@ -1220,19 +1245,18 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
}
new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
- unix_table_double_lock(old_hash, new_hash);
+ unix_table_double_lock(net, old_hash, new_hash);
- if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
- new_hash))
+ if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
goto out_spin;
- __unix_set_addr_hash(sk, addr, new_hash);
- unix_table_double_unlock(old_hash, new_hash);
+ __unix_set_addr_hash(net, sk, addr, new_hash);
+ unix_table_double_unlock(net, old_hash, new_hash);
mutex_unlock(&u->bindlock);
return 0;
out_spin:
- unix_table_double_unlock(old_hash, new_hash);
+ unix_table_double_unlock(net, old_hash, new_hash);
err = -EADDRINUSE;
out_mutex:
mutex_unlock(&u->bindlock);
@@ -1291,9 +1315,8 @@ static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
int alen, int flags)
{
- struct sock *sk = sock->sk;
- struct net *net = sock_net(sk);
struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
+ struct sock *sk = sock->sk;
struct sock *other;
int err;
@@ -1314,7 +1337,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
}
restart:
- other = unix_find_other(net, sunaddr, alen, sock->type);
+ other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
if (IS_ERR(other)) {
err = PTR_ERR(other);
goto out;
@@ -1402,15 +1425,13 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
- struct sock *sk = sock->sk;
- struct net *net = sock_net(sk);
+ struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
struct unix_sock *u = unix_sk(sk), *newu, *otheru;
- struct sock *newsk = NULL;
- struct sock *other = NULL;
+ struct net *net = sock_net(sk);
struct sk_buff *skb = NULL;
- int st;
- int err;
long timeo;
+ int err;
+ int st;
err = unix_validate_addr(sunaddr, addr_len);
if (err)
@@ -1430,7 +1451,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
*/
/* create new sock for complete connection */
- newsk = unix_create1(sock_net(sk), NULL, 0, sock->type);
+ newsk = unix_create1(net, NULL, 0, sock->type);
if (IS_ERR(newsk)) {
err = PTR_ERR(newsk);
newsk = NULL;
@@ -1539,9 +1560,9 @@ restart:
*
* The contents of *(otheru->addr) and otheru->path
* are seen fully set up here, since we have found
- * otheru in hash under unix_table_locks. Insertion
- * into the hash chain we'd found it in had been done
- * in an earlier critical area protected by unix_table_locks,
+ * otheru in hash under its lock. Insertion into the
+ * hash chain we'd found it in had been done in an
+ * earlier critical area protected by the chain's lock,
* the same one where we'd set *(otheru->addr) contents,
* as well as otheru->path and otheru->addr itself.
*
@@ -1838,17 +1859,15 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
size_t len)
{
- struct sock *sk = sock->sk;
- struct net *net = sock_net(sk);
- struct unix_sock *u = unix_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
- struct sock *other = NULL;
- int err;
- struct sk_buff *skb;
- long timeo;
+ struct sock *sk = sock->sk, *other = NULL;
+ struct unix_sock *u = unix_sk(sk);
struct scm_cookie scm;
+ struct sk_buff *skb;
int data_len = 0;
int sk_locked;
+ long timeo;
+ int err;
wait_for_unix_gc();
err = scm_send(sock, msg, &scm, false);
@@ -1915,7 +1934,7 @@ restart:
if (sunaddr == NULL)
goto out_free;
- other = unix_find_other(net, sunaddr, msg->msg_namelen,
+ other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
sk->sk_type);
if (IS_ERR(other)) {
err = PTR_ERR(other);
@@ -3221,12 +3240,11 @@ static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
unsigned long offset = get_offset(*pos);
unsigned long bucket = get_bucket(*pos);
- struct sock *sk;
unsigned long count = 0;
+ struct sock *sk;
- for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
- if (sock_net(sk) != seq_file_net(seq))
- continue;
+ for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
+ sk; sk = sk_next(sk)) {
if (++count == offset)
break;
}
@@ -3237,16 +3255,17 @@ static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
unsigned long bucket = get_bucket(*pos);
+ struct net *net = seq_file_net(seq);
struct sock *sk;
- while (bucket < ARRAY_SIZE(unix_socket_table)) {
- spin_lock(&unix_table_locks[bucket]);
+ while (bucket < UNIX_HASH_SIZE) {
+ spin_lock(&net->unx.table.locks[bucket]);
sk = unix_from_bucket(seq, pos);
if (sk)
return sk;
- spin_unlock(&unix_table_locks[bucket]);
+ spin_unlock(&net->unx.table.locks[bucket]);
*pos = set_bucket_offset(++bucket, 1);
}
@@ -3259,11 +3278,12 @@ static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
{
unsigned long bucket = get_bucket(*pos);
- for (sk = sk_next(sk); sk; sk = sk_next(sk))
- if (sock_net(sk) == seq_file_net(seq))
- return sk;
+ sk = sk_next(sk);
+ if (sk)
+ return sk;
+
- spin_unlock(&unix_table_locks[bucket]);
+ spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
*pos = set_bucket_offset(++bucket, 1);
@@ -3293,7 +3313,7 @@ static void unix_seq_stop(struct seq_file *seq, void *v)
struct sock *sk = v;
if (sk)
- spin_unlock(&unix_table_locks[sk->sk_hash]);
+ spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
}
static int unix_seq_show(struct seq_file *seq, void *v)
@@ -3318,7 +3338,7 @@ static int unix_seq_show(struct seq_file *seq, void *v)
(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
sock_i_ino(s));
- if (u->addr) { // under unix_table_locks here
+ if (u->addr) { // under a hash table lock here
int i, len;
seq_putc(seq, ' ');
@@ -3388,9 +3408,6 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
iter->batch[iter->end_sk++] = start_sk;
for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
- if (sock_net(sk) != seq_file_net(seq))
- continue;
-
if (iter->end_sk < iter->max_sk) {
sock_hold(sk);
iter->batch[iter->end_sk++] = sk;
@@ -3399,7 +3416,7 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
expected++;
}
- spin_unlock(&unix_table_locks[start_sk->sk_hash]);
+ spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
return expected;
}
@@ -3559,7 +3576,7 @@ static const struct net_proto_family unix_family_ops = {
static int __net_init unix_net_init(struct net *net)
{
- int error = -ENOMEM;
+ int i;
net->unx.sysctl_max_dgram_qlen = 10;
if (unix_sysctl_register(net))
@@ -3567,18 +3584,44 @@ static int __net_init unix_net_init(struct net *net)
#ifdef CONFIG_PROC_FS
if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
- sizeof(struct seq_net_private))) {
- unix_sysctl_unregister(net);
- goto out;
+ sizeof(struct seq_net_private)))
+ goto err_sysctl;
+#endif
+
+ net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
+ sizeof(spinlock_t), GFP_KERNEL);
+ if (!net->unx.table.locks)
+ goto err_proc;
+
+ net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
+ sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!net->unx.table.buckets)
+ goto free_locks;
+
+ for (i = 0; i < UNIX_HASH_SIZE; i++) {
+ spin_lock_init(&net->unx.table.locks[i]);
+ INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
}
+
+ return 0;
+
+free_locks:
+ kvfree(net->unx.table.locks);
+err_proc:
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("unix", net->proc_net);
+err_sysctl:
#endif
- error = 0;
+ unix_sysctl_unregister(net);
out:
- return error;
+ return -ENOMEM;
}
static void __net_exit unix_net_exit(struct net *net)
{
+ kvfree(net->unx.table.buckets);
+ kvfree(net->unx.table.locks);
unix_sysctl_unregister(net);
remove_proc_entry("unix", net->proc_net);
}
@@ -3666,8 +3709,10 @@ static int __init af_unix_init(void)
BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
- for (i = 0; i < 2 * UNIX_HASH_SIZE; i++)
- spin_lock_init(&unix_table_locks[i]);
+ for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
+ spin_lock_init(&bsd_socket_locks[i]);
+ INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
+ }
rc = proto_register(&unix_dgram_proto, 1);
if (rc != 0) {
diff --git a/net/unix/diag.c b/net/unix/diag.c
index bb0b5ea1655f..105f522a89fe 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -13,7 +13,7 @@
static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb)
{
- /* might or might not have unix_table_locks */
+ /* might or might not have a hash table lock */
struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
if (!addr)
@@ -195,25 +195,21 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct unix_diag_req *req;
- int num, s_num, slot, s_slot;
struct net *net = sock_net(skb->sk);
+ int num, s_num, slot, s_slot;
+ struct unix_diag_req *req;
req = nlmsg_data(cb->nlh);
s_slot = cb->args[0];
num = s_num = cb->args[1];
- for (slot = s_slot;
- slot < ARRAY_SIZE(unix_socket_table);
- s_num = 0, slot++) {
+ for (slot = s_slot; slot < UNIX_HASH_SIZE; s_num = 0, slot++) {
struct sock *sk;
num = 0;
- spin_lock(&unix_table_locks[slot]);
- sk_for_each(sk, &unix_socket_table[slot]) {
- if (!net_eq(sock_net(sk), net))
- continue;
+ spin_lock(&net->unx.table.locks[slot]);
+ sk_for_each(sk, &net->unx.table.buckets[slot]) {
if (num < s_num)
goto next;
if (!(req->udiag_states & (1 << sk->sk_state)))
@@ -222,13 +218,13 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI) < 0) {
- spin_unlock(&unix_table_locks[slot]);
+ spin_unlock(&net->unx.table.locks[slot]);
goto done;
}
next:
num++;
}
- spin_unlock(&unix_table_locks[slot]);
+ spin_unlock(&net->unx.table.locks[slot]);
}
done:
cb->args[0] = slot;
@@ -237,20 +233,21 @@ done:
return skb->len;
}
-static struct sock *unix_lookup_by_ino(unsigned int ino)
+static struct sock *unix_lookup_by_ino(struct net *net, unsigned int ino)
{
struct sock *sk;
int i;
- for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) {
- spin_lock(&unix_table_locks[i]);
- sk_for_each(sk, &unix_socket_table[i])
+ for (i = 0; i < UNIX_HASH_SIZE; i++) {
+ spin_lock(&net->unx.table.locks[i]);
+ sk_for_each(sk, &net->unx.table.buckets[i]) {
if (ino == sock_i_ino(sk)) {
sock_hold(sk);
- spin_unlock(&unix_table_locks[i]);
+ spin_unlock(&net->unx.table.locks[i]);
return sk;
}
- spin_unlock(&unix_table_locks[i]);
+ }
+ spin_unlock(&net->unx.table.locks[i]);
}
return NULL;
}
@@ -259,21 +256,20 @@ static int unix_diag_get_exact(struct sk_buff *in_skb,
const struct nlmsghdr *nlh,
struct unix_diag_req *req)
{
- int err = -EINVAL;
- struct sock *sk;
- struct sk_buff *rep;
- unsigned int extra_len;
struct net *net = sock_net(in_skb->sk);
+ unsigned int extra_len;
+ struct sk_buff *rep;
+ struct sock *sk;
+ int err;
+ err = -EINVAL;
if (req->udiag_ino == 0)
goto out_nosk;
- sk = unix_lookup_by_ino(req->udiag_ino);
+ sk = unix_lookup_by_ino(net, req->udiag_ino);
err = -ENOENT;
if (sk == NULL)
goto out_nosk;
- if (!net_eq(sock_net(sk), net))
- goto out;
err = sock_diag_check_cookie(sk, req->udiag_cookie);
if (err)
@@ -308,7 +304,6 @@ out_nosk:
static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
{
int hdrlen = sizeof(struct unix_diag_req);
- struct net *net = sock_net(skb->sk);
if (nlmsg_len(h) < hdrlen)
return -EINVAL;
@@ -317,7 +312,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
struct netlink_dump_control c = {
.dump = unix_diag_dump,
};
- return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+ return netlink_dump_start(sock_net(skb->sk)->diag_nlsk, skb, h, &c);
} else
return unix_diag_get_exact(skb, h, nlmsg_data(h));
}
diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c
index 01d44e2598e2..500129aa710c 100644
--- a/net/unix/sysctl_net_unix.c
+++ b/net/unix/sysctl_net_unix.c
@@ -26,11 +26,16 @@ int __net_init unix_sysctl_register(struct net *net)
{
struct ctl_table *table;
- table = kmemdup(unix_table, sizeof(unix_table), GFP_KERNEL);
- if (table == NULL)
- goto err_alloc;
+ if (net_eq(net, &init_net)) {
+ table = unix_table;
+ } else {
+ table = kmemdup(unix_table, sizeof(unix_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ table[0].data = &net->unx.sysctl_max_dgram_qlen;
+ }
- table[0].data = &net->unx.sysctl_max_dgram_qlen;
net->unx.ctl = register_net_sysctl(net, "net/unix", table);
if (net->unx.ctl == NULL)
goto err_reg;
@@ -38,7 +43,8 @@ int __net_init unix_sysctl_register(struct net *net)
return 0;
err_reg:
- kfree(table);
+ if (!net_eq(net, &init_net))
+ kfree(table);
err_alloc:
return -ENOMEM;
}
@@ -49,5 +55,6 @@ void unix_sysctl_unregister(struct net *net)
table = net->unx.ctl->ctl_table_arg;
unregister_net_sysctl_table(net->unx.ctl);
- kfree(table);
+ if (!net_eq(net, &init_net))
+ kfree(table);
}
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 19ac872a6624..09002387987e 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -538,12 +538,6 @@ static int xsk_generic_xmit(struct sock *sk)
goto out;
}
- skb = xsk_build_skb(xs, &desc);
- if (IS_ERR(skb)) {
- err = PTR_ERR(skb);
- goto out;
- }
-
/* This is the backpressure mechanism for the Tx path.
* Reserve space in the completion queue and only proceed
* if there is space in it. This avoids having to implement
@@ -552,11 +546,19 @@ static int xsk_generic_xmit(struct sock *sk)
spin_lock_irqsave(&xs->pool->cq_lock, flags);
if (xskq_prod_reserve(xs->pool->cq)) {
spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
- kfree_skb(skb);
goto out;
}
spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
+ skb = xsk_build_skb(xs, &desc);
+ if (IS_ERR(skb)) {
+ err = PTR_ERR(skb);
+ spin_lock_irqsave(&xs->pool->cq_lock, flags);
+ xskq_prod_cancel(xs->pool->cq);
+ spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
+ goto out;
+ }
+
err = __dev_direct_xmit(skb, xs->queue_id);
if (err == NETDEV_TX_BUSY) {
/* Tell user-space to retry the send */
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 87bdd71c7bb6..f70112176b7c 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -332,6 +332,7 @@ static void __xp_dma_unmap(struct xsk_dma_map *dma_map, unsigned long attrs)
for (i = 0; i < dma_map->dma_pages_cnt; i++) {
dma = &dma_map->dma_pages[i];
if (*dma) {
+ *dma &= ~XSK_NEXT_PG_CONTIG_MASK;
dma_unmap_page_attrs(dma_map->dev, *dma, PAGE_SIZE,
DMA_BIDIRECTIONAL, attrs);
*dma = 0;