From 955d3411a17f590364238bd0d3329b61f20c1cd2 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 30 Dec 2018 12:46:01 +0100 Subject: batman-adv: Avoid WARN on net_device without parent in netns It is not allowed to use WARN* helpers on potential incorrect input from the user or transient problems because systems configured as panic_on_warn will reboot due to such a problem. A NULL return value of __dev_get_by_index can be caused by various problems which can either be related to the system configuration or problems (incorrectly returned network namespaces) in other (virtual) net_device drivers. batman-adv should not cause a (harmful) WARN in this situation and instead only report it via a simple message. Fixes: b7eddd0b3950 ("batman-adv: prevent using any virtual device created on batman-adv as hard-interface") Reported-by: syzbot+c764de0fcfadca9a8595@syzkaller.appspotmail.com Reported-by: Dmitry Vyukov Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 508f4416dfc9..415d494cbe22 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -20,7 +20,6 @@ #include "main.h" #include -#include #include #include #include @@ -179,8 +178,10 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) parent_dev = __dev_get_by_index((struct net *)parent_net, dev_get_iflink(net_dev)); /* if we got a NULL parent_dev there is something broken.. */ - if (WARN(!parent_dev, "Cannot find parent device")) + if (!parent_dev) { + pr_err("Cannot find parent device\n"); return false; + } if (batadv_mutual_parents(net_dev, net, parent_dev, parent_net)) return false; -- cgit v1.2.3 From 9114daa825fc3f335f9bea3313ce667090187280 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 31 Dec 2018 22:31:01 +0100 Subject: batman-adv: Force mac header to start of data on xmit The caller of ndo_start_xmit may not already have called skb_reset_mac_header. The returned value of skb_mac_header/eth_hdr therefore can be in the wrong position and even outside the current skbuff. This for example happens when the user binds to the device using a PF_PACKET-SOCK_RAW with enabled qdisc-bypass: int opt = 4; setsockopt(sock, SOL_PACKET, PACKET_QDISC_BYPASS, &opt, sizeof(opt)); Since eth_hdr is used all over the codebase, the batadv_interface_tx function must always take care of resetting it. Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Reported-by: syzbot+9d7405c7faa390e60b4e@syzkaller.appspotmail.com Reported-by: syzbot+7d20bc3f1ddddc0f9079@syzkaller.appspotmail.com Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/soft-interface.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 5db5a0a4c959..b85ca809e509 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -221,6 +221,8 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, netif_trans_update(soft_iface); vid = batadv_get_vid(skb, 0); + + skb_reset_mac_header(skb); ethhdr = eth_hdr(skb); switch (ntohs(ethhdr->h_proto)) { -- cgit v1.2.3 From 7c1e8a3817c55d73b27cc29b84075999c8894179 Mon Sep 17 00:00:00 2001 From: Arthur Gautier Date: Mon, 31 Dec 2018 02:10:58 +0000 Subject: netlink: fixup regression in RTM_GETADDR This commit fixes a regression in AF_INET/RTM_GETADDR and AF_INET6/RTM_GETADDR. Before this commit, the kernel would stop dumping addresses once the first skb was full and end the stream with NLMSG_DONE(-EMSGSIZE). The error shouldn't be sent back to netlink_dump so the callback is kept alive. The userspace is expected to call back with a new empty skb. Changes from V1: - The error is not handled in netlink_dump anymore but rather in inet_dump_ifaddr and inet6_dump_addr directly as suggested by David Ahern. Fixes: d7e38611b81e ("net/ipv4: Put target net when address dump fails due to bad attributes") Fixes: 242afaa6968c ("net/ipv6: Put target net when address dump fails due to bad attributes") Cc: David Ahern Cc: "David S . Miller" Cc: netdev@vger.kernel.org Signed-off-by: Arthur Gautier Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 2 +- net/ipv6/addrconf.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 04ba321ae5ce..e258a00b4a3d 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1826,7 +1826,7 @@ put_tgt_net: if (fillargs.netnsid >= 0) put_net(tgt_net); - return err < 0 ? err : skb->len; + return skb->len ? : err; } static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8eeec6eb2bd3..93d5ad2b1a69 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5154,7 +5154,7 @@ put_tgt_net: if (fillargs.netnsid >= 0) put_net(tgt_net); - return err < 0 ? err : skb->len; + return skb->len ? : err; } static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) -- cgit v1.2.3 From f8c468e8537925e0c4607263f498a1b7c0c8982e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 Jan 2019 13:01:43 -0800 Subject: net, skbuff: do not prefer skb allocation fails early Commit dcda9b04713c ("mm, tree wide: replace __GFP_REPEAT by __GFP_RETRY_MAYFAIL with more useful semantic") replaced __GFP_REPEAT in alloc_skb_with_frags() with __GFP_RETRY_MAYFAIL when the allocation may directly reclaim. The previous behavior would require reclaim up to 1 << order pages for skb aligned header_len of order > PAGE_ALLOC_COSTLY_ORDER before failing, otherwise the allocations in alloc_skb() would loop in the page allocator looking for memory. __GFP_RETRY_MAYFAIL makes both allocations failable under memory pressure, including for the HEAD allocation. This can cause, among many other things, write() to fail with ENOTCONN during RPC when under memory pressure. These allocations should succeed as they did previous to dcda9b04713c even if it requires calling the oom killer and additional looping in the page allocator to find memory. There is no way to specify the previous behavior of __GFP_REPEAT, but it's unlikely to be necessary since the previous behavior only guaranteed that 1 << order pages would be reclaimed before failing for order > PAGE_ALLOC_COSTLY_ORDER. That reclaim is not guaranteed to be contiguous memory, so repeating for such large orders is usually not beneficial. Removing the setting of __GFP_RETRY_MAYFAIL to restore the previous behavior, specifically not allowing alloc_skb() to fail for small orders and oom kill if necessary rather than allowing RPCs to fail. Fixes: dcda9b04713c ("mm, tree wide: replace __GFP_REPEAT by __GFP_RETRY_MAYFAIL with more useful semantic") Signed-off-by: David Rientjes Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 37317ffec146..26d848484912 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5270,7 +5270,6 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long chunk; struct sk_buff *skb; struct page *page; - gfp_t gfp_head; int i; *errcode = -EMSGSIZE; @@ -5280,12 +5279,8 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, if (npages > MAX_SKB_FRAGS) return NULL; - gfp_head = gfp_mask; - if (gfp_head & __GFP_DIRECT_RECLAIM) - gfp_head |= __GFP_RETRY_MAYFAIL; - *errcode = -ENOBUFS; - skb = alloc_skb(header_len, gfp_head); + skb = alloc_skb(header_len, gfp_mask); if (!skb) return NULL; -- cgit v1.2.3 From 41e4e2cd75346667b0c531c07dab05cce5b06d15 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Thu, 3 Jan 2019 09:51:57 -0800 Subject: openvswitch: Fix IPv6 later frags parsing The previous commit fa642f08839b ("openvswitch: Derive IP protocol number for IPv6 later frags") introduces IP protocol number parsing for IPv6 later frags that can mess up the network header length calculation logic, i.e. nh_len < 0. However, the network header length calculation is mainly for deriving the transport layer header in the key extraction process which the later fragment does not apply. Therefore, this commit skips the network header length calculation to fix the issue. Reported-by: Chris Mi Reported-by: Greg Rose Fixes: fa642f08839b ("openvswitch: Derive IP protocol number for IPv6 later frags") Signed-off-by: Yi-Hung Wei Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 57e07768c9d1..f54cf17ef7a8 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -276,10 +276,12 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags); if (flags & IP6_FH_F_FRAG) { - if (frag_off) + if (frag_off) { key->ip.frag = OVS_FRAG_TYPE_LATER; - else - key->ip.frag = OVS_FRAG_TYPE_FIRST; + key->ip.proto = nexthdr; + return 0; + } + key->ip.frag = OVS_FRAG_TYPE_FIRST; } else { key->ip.frag = OVS_FRAG_TYPE_NONE; } -- cgit v1.2.3 From bc6e019b6ee65ff4ebf3ca272f774cf6c67db669 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 3 Jan 2019 21:43:34 +0100 Subject: fou: Prevent unbounded recursion in GUE error handler also with UDP-Lite In commit 11789039da53 ("fou: Prevent unbounded recursion in GUE error handler"), I didn't take care of the case where UDP-Lite is encapsulated into UDP or UDP-Lite with GUE. From a syzbot report about a possibly similar issue with GUE on IPv6, I just realised the same thing might happen with a UDP-Lite inner payload. Also skip exception handling for inner UDP-Lite protocol. Fixes: 11789039da53 ("fou: Prevent unbounded recursion in GUE error handler") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv4/fou.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 0c9f171fb085..632863541082 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -1065,7 +1065,8 @@ static int gue_err(struct sk_buff *skb, u32 info) * recursion. Besides, this kind of encapsulation can't even be * configured currently. Discard this. */ - if (guehdr->proto_ctype == IPPROTO_UDP) + if (guehdr->proto_ctype == IPPROTO_UDP || + guehdr->proto_ctype == IPPROTO_UDPLITE) return -EOPNOTSUPP; skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); -- cgit v1.2.3 From 44039e00171b0fe930c07ff7b43e6023eaf1ed31 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 3 Jan 2019 21:43:35 +0100 Subject: fou6: Prevent unbounded recursion in GUE error handler I forgot to deal with IPv6 in commit 11789039da53 ("fou: Prevent unbounded recursion in GUE error handler"). Now syzbot reported what might be the same type of issue, caused by gue6_err(), that is, handling exceptions for direct UDP encapsulation in GUE (UDP-in-UDP) leads to unbounded recursion in the GUE exception handler. As it probably doesn't make sense to set up GUE this way, and it's currently not even possible to configure this, skip exception handling for UDP (or UDP-Lite) packets encapsulated in UDP (or UDP-Lite) packets with GUE on IPv6. Reported-by: syzbot+4ad25edc7a33e4ab91e0@syzkaller.appspotmail.com Reported-by: Willem de Bruijn Reported-by: Eric Dumazet Fixes: b8a51b38e4d4 ("fou, fou6: ICMP error handlers for FoU and GUE") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/fou6.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c index bd675c61deb1..7da7bf3b7fe3 100644 --- a/net/ipv6/fou6.c +++ b/net/ipv6/fou6.c @@ -131,6 +131,14 @@ static int gue6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (validate_gue_flags(guehdr, optlen)) return -EINVAL; + /* Handling exceptions for direct UDP encapsulation in GUE would lead to + * recursion. Besides, this kind of encapsulation can't even be + * configured currently. Discard this. + */ + if (guehdr->proto_ctype == IPPROTO_UDP || + guehdr->proto_ctype == IPPROTO_UDPLITE) + return -EOPNOTSUPP; + skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr)); ret = gue6_err_proto_handler(guehdr->proto_ctype, skb, opt, type, code, offset, info); -- cgit v1.2.3 From 8d933670452107e41165bea70a30dffbd281bef1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Jan 2019 11:00:00 -0800 Subject: ipv6: make icmp6_send() robust against null skb->dev syzbot was able to crash one host with the following stack trace : kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 8625 Comm: syz-executor4 Not tainted 4.20.0+ #8 RIP: 0010:dev_net include/linux/netdevice.h:2169 [inline] RIP: 0010:icmp6_send+0x116/0x2d30 net/ipv6/icmp.c:426 icmpv6_send smack_socket_sock_rcv_skb security_sock_rcv_skb sk_filter_trim_cap __sk_receive_skb dccp_v6_do_rcv release_sock This is because a RX packet found socket owned by user and was stored into socket backlog. Before leaving RCU protected section, skb->dev was cleared in __sk_receive_skb(). When socket backlog was finally handled at release_sock() time, skb was fed to smack_socket_sock_rcv_skb() then icmp6_send() We could fix the bug in smack_socket_sock_rcv_skb(), or simply make icmp6_send() more robust against such possibility. In the future we might provide to icmp6_send() the net pointer instead of infering it. Fixes: d66a8acbda92 ("Smack: Inform peer that IPv6 traffic has been blocked") Signed-off-by: Eric Dumazet Cc: Piotr Sawicki Cc: Casey Schaufler Reported-by: syzbot Acked-by: Casey Schaufler Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 5d7aa2c2770c..bbcdfd299692 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -423,10 +423,10 @@ static int icmp6_iif(const struct sk_buff *skb) static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct in6_addr *force_saddr) { - struct net *net = dev_net(skb->dev); struct inet6_dev *idev = NULL; struct ipv6hdr *hdr = ipv6_hdr(skb); struct sock *sk; + struct net *net; struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; struct dst_entry *dst; @@ -437,12 +437,16 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, int iif = 0; int addr_type = 0; int len; - u32 mark = IP6_REPLY_MARK(net, skb->mark); + u32 mark; if ((u8 *)hdr < skb->head || (skb_network_header(skb) + sizeof(*hdr)) > skb_tail_pointer(skb)) return; + if (!skb->dev) + return; + net = dev_net(skb->dev); + mark = IP6_REPLY_MARK(net, skb->mark); /* * Make sure we respect the rules * i.e. RFC 1885 2.4(e) -- cgit v1.2.3 From ec90ad334986fa5856d11dd272f7f22fa86c55c4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 4 Jan 2019 16:58:15 -0800 Subject: ipv6: Consider sk_bound_dev_if when binding a socket to a v4 mapped address Similar to c5ee066333eb ("ipv6: Consider sk_bound_dev_if when binding a socket to an address"), binding a socket to v4 mapped addresses needs to consider if the socket is bound to a device. This problem also exists from the beginning of git history. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 0bfb6cc0a30a..93288b9f1697 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -310,6 +310,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Check if the address belongs to the host. */ if (addr_type == IPV6_ADDR_MAPPED) { + struct net_device *dev = NULL; int chk_addr_ret; /* Binding to v4-mapped address on a v6-only socket @@ -320,9 +321,17 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, goto out; } + if (sk->sk_bound_dev_if) { + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); + if (!dev) { + err = -ENODEV; + goto out; + } + } + /* Reproduce AF_INET checks to make the bindings consistent */ v4addr = addr->sin6_addr.s6_addr32[3]; - chk_addr_ret = inet_addr_type(net, v4addr); + chk_addr_ret = inet_addr_type_dev_table(net, dev, v4addr); if (!inet_can_nonlocal_bind(net, inet) && v4addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && -- cgit v1.2.3 From e8e36984080b55ac5e57bdb09a5b570f2fc8e963 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 4 Jan 2019 01:07:07 -0800 Subject: bpf: Fix [::] -> [::1] rewrite in sys_sendmsg sys_sendmsg has supported unspecified destination IPv6 (wildcard) for unconnected UDP sockets since 876c7f41. When [::] is passed by user as destination, sys_sendmsg rewrites it with [::1] to be consistent with BSD (see "BSD'ism" comment in the code). This didn't work when cgroup-bpf was enabled though since the rewrite [::] -> [::1] happened before passing control to cgroup-bpf block where fl6.daddr was updated with passed by user sockaddr_in6.sin6_addr (that might or might not be changed by BPF program). That way if user passed [::] as dst IPv6 it was first rewritten with [::1] by original code from 876c7f41, but then rewritten back with [::] by cgroup-bpf block. It happened even when BPF_CGROUP_UDP6_SENDMSG program was not present (CONFIG_CGROUP_BPF=y was enough). The fix is to apply BSD'ism after cgroup-bpf block so that [::] is replaced with [::1] no matter where it came from: passed by user to sys_sendmsg or set by BPF_CGROUP_UDP6_SENDMSG program. Fixes: 1cedee13d25a ("bpf: Hooks for sys_sendmsg") Reported-by: Nitin Rawat Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- net/ipv6/udp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9cbf363172bd..7c3505006f8e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1390,10 +1390,7 @@ do_udp_sendmsg: ipc6.opt = opt; fl6.flowi6_proto = sk->sk_protocol; - if (!ipv6_addr_any(daddr)) - fl6.daddr = *daddr; - else - fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ + fl6.daddr = *daddr; if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; @@ -1421,6 +1418,9 @@ do_udp_sendmsg: } } + if (ipv6_addr_any(&fl6.daddr)) + fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ + final_p = fl6_update_dst(&fl6, opt, &final); if (final_p) connected = false; -- cgit v1.2.3 From d4a7e9bb74b5aaf07b89f6531c080b1130bdf019 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 5 Jan 2019 07:35:04 -0800 Subject: ipv6: Take rcu_read_lock in __inet6_bind for mapped addresses I realized the last patch calls dev_get_by_index_rcu in a branch not holding the rcu lock. Add the calls to rcu_read_lock and rcu_read_unlock. Fixes: ec90ad334986 ("ipv6: Consider sk_bound_dev_if when binding a socket to a v4 mapped address") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 93288b9f1697..d99753b5e39b 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -321,17 +321,20 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, goto out; } + rcu_read_lock(); if (sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); if (!dev) { err = -ENODEV; - goto out; + goto out_unlock; } } /* Reproduce AF_INET checks to make the bindings consistent */ v4addr = addr->sin6_addr.s6_addr32[3]; chk_addr_ret = inet_addr_type_dev_table(net, dev, v4addr); + rcu_read_unlock(); + if (!inet_can_nonlocal_bind(net, inet) && v4addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && -- cgit v1.2.3 From 0aaa81377c5a01f686bcdb8c7a6929a7bf330c68 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 4 Jan 2019 15:55:26 +0100 Subject: can: gw: ensure DLC boundaries after CAN frame modification Muyu Yu provided a POC where user root with CAP_NET_ADMIN can create a CAN frame modification rule that makes the data length code a higher value than the available CAN frame data size. In combination with a configured checksum calculation where the result is stored relatively to the end of the data (e.g. cgw_csum_xor_rel) the tail of the skb (e.g. frag_list pointer in skb_shared_info) can be rewritten which finally can cause a system crash. Michael Kubecek suggested to drop frames that have a DLC exceeding the available space after the modification process and provided a patch that can handle CAN FD frames too. Within this patch we also limit the length for the checksum calculations to the maximum of Classic CAN data length (8). CAN frames that are dropped by these additional checks are counted with the CGW_DELETED counter which indicates misconfigurations in can-gw rules. This fixes CVE-2019-3701. Reported-by: Muyu Yu Reported-by: Marcus Meissner Suggested-by: Michal Kubecek Tested-by: Muyu Yu Tested-by: Oliver Hartkopp Signed-off-by: Oliver Hartkopp Cc: linux-stable # >= v3.2 Signed-off-by: Marc Kleine-Budde Signed-off-by: David S. Miller --- net/can/gw.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/can/gw.c b/net/can/gw.c index faa3da88a127..53859346dc9a 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -416,13 +416,29 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) while (modidx < MAX_MODFUNCTIONS && gwj->mod.modfunc[modidx]) (*gwj->mod.modfunc[modidx++])(cf, &gwj->mod); - /* check for checksum updates when the CAN frame has been modified */ + /* Has the CAN frame been modified? */ if (modidx) { - if (gwj->mod.csumfunc.crc8) + /* get available space for the processed CAN frame type */ + int max_len = nskb->len - offsetof(struct can_frame, data); + + /* dlc may have changed, make sure it fits to the CAN frame */ + if (cf->can_dlc > max_len) + goto out_delete; + + /* check for checksum updates in classic CAN length only */ + if (gwj->mod.csumfunc.crc8) { + if (cf->can_dlc > 8) + goto out_delete; + (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8); + } + + if (gwj->mod.csumfunc.xor) { + if (cf->can_dlc > 8) + goto out_delete; - if (gwj->mod.csumfunc.xor) (*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor); + } } /* clear the skb timestamp if not configured the other way */ @@ -434,6 +450,14 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) gwj->dropped_frames++; else gwj->handled_frames++; + + return; + + out_delete: + /* delete frame due to misconfiguration */ + gwj->deleted_frames++; + kfree_skb(nskb); + return; } static inline int cgw_register_filter(struct net *net, struct cgw_job *gwj) -- cgit v1.2.3 From eeb2c4fb6a3d0ebed35fbc13a255f691c8b8d7e5 Mon Sep 17 00:00:00 2001 From: Jacob Wen Date: Mon, 7 Jan 2019 09:59:59 +0800 Subject: rds: use DIV_ROUND_UP instead of ceil Yes indeed, DIV_ROUND_UP is in kernel.h. Signed-off-by: Jacob Wen Signed-off-by: David S. Miller --- net/rds/ib_send.c | 4 ++-- net/rds/message.c | 4 ++-- net/rds/rds.h | 4 ---- net/rds/send.c | 2 +- 4 files changed, 5 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 2dcb555e6350..4e0c36acf866 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -522,7 +522,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) i = 1; else - i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); + i = DIV_ROUND_UP(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); if (work_alloc == 0) { @@ -879,7 +879,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) * Instead of knowing how to return a partial rdma read/write we insist that there * be enough work requests to send the entire message. */ - i = ceil(op->op_count, max_sge); + i = DIV_ROUND_UP(op->op_count, max_sge); work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); if (work_alloc != i) { diff --git a/net/rds/message.c b/net/rds/message.c index f139420ba1f6..50f13f1d4ae0 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -341,7 +341,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in { struct rds_message *rm; unsigned int i; - int num_sgs = ceil(total_len, PAGE_SIZE); + int num_sgs = DIV_ROUND_UP(total_len, PAGE_SIZE); int extra_bytes = num_sgs * sizeof(struct scatterlist); int ret; @@ -351,7 +351,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); - rm->data.op_nents = ceil(total_len, PAGE_SIZE); + rm->data.op_nents = DIV_ROUND_UP(total_len, PAGE_SIZE); rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs, &ret); if (!rm->data.op_sg) { rds_message_put(rm); diff --git a/net/rds/rds.h b/net/rds/rds.h index 02ec4a3b2799..4ffe100ff5e6 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -48,10 +48,6 @@ void rdsdebug(char *fmt, ...) } #endif -/* XXX is there one of these somewhere? */ -#define ceil(x, y) \ - ({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; }) - #define RDS_FRAG_SHIFT 12 #define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) diff --git a/net/rds/send.c b/net/rds/send.c index 3d822bad7de9..fd8b687d5c05 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1107,7 +1107,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) size_t total_payload_len = payload_len, rdma_payload_len = 0; bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); - int num_sgs = ceil(payload_len, PAGE_SIZE); + int num_sgs = DIV_ROUND_UP(payload_len, PAGE_SIZE); int namelen; struct rds_iov_vector_arr vct; int ind; -- cgit v1.2.3 From f87d8ad9233f115db92c6c087d58403b0009ed36 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 5 Jan 2019 10:52:23 -0600 Subject: tipc: fix memory leak in tipc_nl_compat_publ_dump There is a memory leak in case genlmsg_put fails. Fix this by freeing *args* before return. Addresses-Coverity-ID: 1476406 ("Resource leak") Fixes: 46273cf7e009 ("tipc: fix a missing check of genlmsg_put") Signed-off-by: Gustavo A. R. Silva Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 40f5cae623a7..77e4b2418f30 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -904,8 +904,10 @@ static int tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg, u32 sock) hdr = genlmsg_put(args, 0, 0, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_PUBL_GET); - if (!hdr) + if (!hdr) { + kfree_skb(args); return -EMSGSIZE; + } nest = nla_nest_start(args, TIPC_NLA_SOCK); if (!nest) { -- cgit v1.2.3 From 4c84edc11b76590859b1e45dd676074c59602dc4 Mon Sep 17 00:00:00 2001 From: JianJhen Chen Date: Sun, 6 Jan 2019 11:28:13 +0800 Subject: net: bridge: fix a bug on using a neighbour cache entry without checking its state When handling DNAT'ed packets on a bridge device, the neighbour cache entry from lookup was used without checking its state. It means that a cache entry in the NUD_STALE state will be used directly instead of entering the NUD_DELAY state to confirm the reachability of the neighbor. This problem becomes worse after commit 2724680bceee ("neigh: Keep neighbour cache entries if number of them is small enough."), since all neighbour cache entries in the NUD_STALE state will be kept in the neighbour table as long as the number of cache entries does not exceed the value specified in gc_thresh1. This commit validates the state of a neighbour cache entry before using the entry. Signed-off-by: JianJhen Chen Reviewed-by: JinLin Chen Signed-off-by: David S. Miller --- net/bridge/br_netfilter_hooks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index d21a23698410..c93c35bb73dd 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -265,7 +265,7 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); int ret; - if (neigh->hh.hh_len) { + if ((neigh->nud_state & NUD_CONNECTED) && neigh->hh.hh_len) { neigh_hh_bridge(&neigh->hh, skb); skb->dev = nf_bridge->physindev; ret = br_handle_frame_finish(net, sk, skb); -- cgit v1.2.3 From 26d92e951fe0a44ee4aec157cabb65a818cc8151 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 5 Jan 2019 23:45:26 -0800 Subject: smc: move unhash as early as possible in smc_release() In smc_release() we release smc->clcsock before unhash the smc sock, but a parallel smc_diag_dump() may be still reading smc->clcsock, therefore this could cause a use-after-free as reported by syzbot. Reported-and-tested-by: syzbot+fbd1e5476e4c94c7b34e@syzkaller.appspotmail.com Fixes: 51f1de79ad8e ("net/smc: replace sock_put worker by socket refcounting") Cc: Ursula Braun Signed-off-by: Cong Wang Reported-by: syzbot+0bf2e01269f1274b4b03@syzkaller.appspotmail.com Reported-by: syzbot+e3132895630f957306bc@syzkaller.appspotmail.com Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c4da4a78d369..c4e56602e0c6 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -146,6 +146,9 @@ static int smc_release(struct socket *sock) sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } + + sk->sk_prot->unhash(sk); + if (smc->clcsock) { if (smc->use_fallback && sk->sk_state == SMC_LISTEN) { /* wake up clcsock accept */ @@ -170,7 +173,6 @@ static int smc_release(struct socket *sock) smc_conn_free(&smc->conn); release_sock(sk); - sk->sk_prot->unhash(sk); sock_put(sk); /* final sock_put */ out: return rc; -- cgit v1.2.3 From 02b2f549d502b46e68b97ea1452fb8853b3327dd Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 18 Dec 2018 04:31:48 -0500 Subject: libceph: allow setting abort_on_full for rbd Introduce a new option abort_on_full, default to false. Then we can get -ENOSPC when the pool is full, or reaches quota. [ Don't show abort_on_full in /proc/mounts. ] Signed-off-by: Dongsheng Yang Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 4 ++-- include/linux/ceph/libceph.h | 6 ++++-- include/linux/ceph/osd_client.h | 1 - net/ceph/ceph_common.c | 11 ++++++++++- net/ceph/debugfs.c | 2 +- net/ceph/osd_client.c | 4 ++-- 6 files changed, 19 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 4e9a7cc488da..da2cd8e89062 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -530,7 +530,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_putc(m, ','); pos = m->count; - ret = ceph_print_client_options(m, fsc->client); + ret = ceph_print_client_options(m, fsc->client, false); if (ret) return ret; @@ -640,7 +640,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, opt = NULL; /* fsc->client now owns this */ fsc->client->extra_mon_dispatch = extra_mon_dispatch; - fsc->client->osdc.abort_on_full = true; + ceph_set_opt(fsc->client, ABORT_ON_FULL); if (!fsopt->mds_namespace) { ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 68bb09c29ce8..a420c07904bc 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -35,6 +35,7 @@ #define CEPH_OPT_NOMSGAUTH (1<<4) /* don't require msg signing feat */ #define CEPH_OPT_TCP_NODELAY (1<<5) /* TCP_NODELAY on TCP sockets */ #define CEPH_OPT_NOMSGSIGN (1<<6) /* don't sign msgs */ +#define CEPH_OPT_ABORT_ON_FULL (1<<7) /* abort w/ ENOSPC when full */ #define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY) @@ -53,7 +54,7 @@ struct ceph_options { unsigned long osd_request_timeout; /* jiffies */ /* - * any type that can't be simply compared or doesn't need need + * any type that can't be simply compared or doesn't need * to be compared should go beyond this point, * ceph_compare_options() should be updated accordingly */ @@ -281,7 +282,8 @@ extern struct ceph_options *ceph_parse_options(char *options, const char *dev_name, const char *dev_name_end, int (*parse_extra_token)(char *c, void *private), void *private); -int ceph_print_client_options(struct seq_file *m, struct ceph_client *client); +int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, + bool show_all); extern void ceph_destroy_options(struct ceph_options *opt); extern int ceph_compare_options(struct ceph_options *new_opt, struct ceph_client *client); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 7a2af5034278..2294f963dab7 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -354,7 +354,6 @@ struct ceph_osd_client { struct rb_root linger_map_checks; atomic_t num_requests; atomic_t num_homeless; - bool abort_on_full; /* abort w/ ENOSPC when full */ int abort_err; struct delayed_work timeout_work; struct delayed_work osds_timeout_work; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 87afb9ec4c68..9cab80207ced 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -255,6 +255,7 @@ enum { Opt_nocephx_sign_messages, Opt_tcp_nodelay, Opt_notcp_nodelay, + Opt_abort_on_full, }; static match_table_t opt_tokens = { @@ -280,6 +281,7 @@ static match_table_t opt_tokens = { {Opt_nocephx_sign_messages, "nocephx_sign_messages"}, {Opt_tcp_nodelay, "tcp_nodelay"}, {Opt_notcp_nodelay, "notcp_nodelay"}, + {Opt_abort_on_full, "abort_on_full"}, {-1, NULL} }; @@ -535,6 +537,10 @@ ceph_parse_options(char *options, const char *dev_name, opt->flags &= ~CEPH_OPT_TCP_NODELAY; break; + case Opt_abort_on_full: + opt->flags |= CEPH_OPT_ABORT_ON_FULL; + break; + default: BUG_ON(token); } @@ -549,7 +555,8 @@ out: } EXPORT_SYMBOL(ceph_parse_options); -int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) +int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, + bool show_all) { struct ceph_options *opt = client->options; size_t pos = m->count; @@ -574,6 +581,8 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) seq_puts(m, "nocephx_sign_messages,"); if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) seq_puts(m, "notcp_nodelay,"); + if (show_all && (opt->flags & CEPH_OPT_ABORT_ON_FULL)) + seq_puts(m, "abort_on_full,"); if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) seq_printf(m, "mount_timeout=%d,", diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 02952605d121..46f65709a6ff 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -375,7 +375,7 @@ static int client_options_show(struct seq_file *s, void *p) struct ceph_client *client = s->private; int ret; - ret = ceph_print_client_options(s, client); + ret = ceph_print_client_options(s, client, true); if (ret) return ret; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index d23a9f81f3d7..fa9530dd876e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2315,7 +2315,7 @@ again: (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || pool_full(osdc, req->r_t.base_oloc.pool))) { dout("req %p full/pool_full\n", req); - if (osdc->abort_on_full) { + if (ceph_test_opt(osdc->client, ABORT_ON_FULL)) { err = -ENOSPC; } else { pr_warn_ratelimited("FULL or reached pool quota\n"); @@ -2545,7 +2545,7 @@ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) { bool victims = false; - if (osdc->abort_on_full && + if (ceph_test_opt(osdc->client, ABORT_ON_FULL) && (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc))) for_each_request(osdc, abort_on_full_fn, &victims); } -- cgit v1.2.3 From 4429b668e0375206408617d6440e3bb76c56c7d2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 5 Jan 2019 16:06:07 +0300 Subject: xprtrdma: Fix error code in rpcrdma_buffer_create() This should return -ENOMEM if __alloc_workqueue_key() fails, but it returns success. Fixes: 6d2d0ee27c7a ("xprtrdma: Replace rpcrdma_receive_wq with a per-xprt workqueue") Signed-off-by: Dan Carpenter Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 7749a2bf6887..3dde05892c8e 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1113,8 +1113,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) WQ_MEM_RECLAIM | WQ_HIGHPRI, 0, r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR]); - if (!buf->rb_completion_wq) + if (!buf->rb_completion_wq) { + rc = -ENOMEM; goto out; + } return 0; out: -- cgit v1.2.3 From 6e17f58c486d9554341f70aa5b63b8fbed07b3fa Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 5 Jan 2019 16:06:48 +0300 Subject: xprtrdma: Double free in rpcrdma_sendctxs_create() The clean up is handled by the caller, rpcrdma_buffer_create(), so this call to rpcrdma_sendctxs_destroy() leads to a double free. Fixes: ae72950abf99 ("xprtrdma: Add data structure to manage RDMA Send arguments") Signed-off-by: Dan Carpenter Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3dde05892c8e..4994e75945b8 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -845,17 +845,13 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) for (i = 0; i <= buf->rb_sc_last; i++) { sc = rpcrdma_sendctx_create(&r_xprt->rx_ia); if (!sc) - goto out_destroy; + return -ENOMEM; sc->sc_xprt = r_xprt; buf->rb_sc_ctxs[i] = sc; } return 0; - -out_destroy: - rpcrdma_sendctxs_destroy(buf); - return -ENOMEM; } /* The sendctx queue is not guaranteed to have a size that is a -- cgit v1.2.3 From 6a829eb8619fbdde6d7d627ad582fe119805f39d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 3 Jan 2019 09:04:45 -0500 Subject: SUNRPC: Fix TCP receive code on archs with flush_dcache_page() After receiving data into the page cache, we need to call flush_dcache_page() for the architectures that define it. Fixes: 277e4ab7d530b ("SUNRPC: Simplify TCP receive code by switching...") Reported-by: Geert Uytterhoeven Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v4.20 Tested-by: Geert Uytterhoeven Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 13559e6a460b..7754aa3e434f 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -376,6 +377,26 @@ xs_read_discard(struct socket *sock, struct msghdr *msg, int flags, return sock_recvmsg(sock, msg, flags); } +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +static void +xs_flush_bvec(const struct bio_vec *bvec, size_t count, size_t seek) +{ + struct bvec_iter bi = { + .bi_size = count, + }; + struct bio_vec bv; + + bvec_iter_advance(bvec, &bi, seek & PAGE_MASK); + for_each_bvec(bv, bvec, bi, bi) + flush_dcache_page(bv.bv_page); +} +#else +static inline void +xs_flush_bvec(const struct bio_vec *bvec, size_t count, size_t seek) +{ +} +#endif + static ssize_t xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, struct xdr_buf *buf, size_t count, size_t seek, size_t *read) @@ -409,6 +430,7 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, seek + buf->page_base); if (ret <= 0) goto sock_err; + xs_flush_bvec(buf->bvec, ret, seek + buf->page_base); offset += ret - buf->page_base; if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; -- cgit v1.2.3 From 279737939a8194f02fa352ab4476a1b241f44ef4 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Jan 2019 16:48:11 +0000 Subject: net: bridge: Fix VLANs memory leak When adding / deleting VLANs to / from a bridge port, the bridge driver first tries to propagate the information via switchdev and falls back to the 8021q driver in case the underlying driver does not support switchdev. This can result in a memory leak [1] when VXLAN and mlxsw ports are enslaved to the bridge: $ ip link set dev vxlan0 master br0 # No mlxsw ports are enslaved to 'br0', so mlxsw ignores the switchdev # notification and the bridge driver adds the VLAN on 'vxlan0' via the # 8021q driver $ bridge vlan add vid 10 dev vxlan0 pvid untagged # mlxsw port is enslaved to the bridge $ ip link set dev swp1 master br0 # mlxsw processes the switchdev notification and the 8021q driver is # skipped $ bridge vlan del vid 10 dev vxlan0 This results in 'struct vlan_info' and 'struct vlan_vid_info' being leaked, as they were allocated by the 8021q driver during VLAN addition, but never freed as the 8021q driver was skipped during deletion. Fix this by introducing a new VLAN private flag that indicates whether the VLAN was added on the port by switchdev or the 8021q driver. If the VLAN was added by the 8021q driver, then we make sure to delete it via the 8021q driver as well. [1] unreferenced object 0xffff88822d20b1e8 (size 256): comm "bridge", pid 2532, jiffies 4295216998 (age 1188.830s) hex dump (first 32 bytes): e0 42 97 ce 81 88 ff ff 00 00 00 00 00 00 00 00 .B.............. 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000f82d851d>] kmem_cache_alloc_trace+0x1be/0x330 [<00000000e0178b02>] vlan_vid_add+0x661/0x920 [<00000000218ebd5f>] __vlan_add+0x1be9/0x3a00 [<000000006eafa1ca>] nbp_vlan_add+0x8b3/0xd90 [<000000003535392c>] br_vlan_info+0x132/0x410 [<00000000aedaa9dc>] br_afspec+0x75c/0x870 [<00000000f5716133>] br_setlink+0x3dc/0x6d0 [<00000000aceca5e2>] rtnl_bridge_setlink+0x615/0xb30 [<00000000a2f2d23e>] rtnetlink_rcv_msg+0x3a3/0xa80 [<0000000064097e69>] netlink_rcv_skb+0x152/0x3c0 [<000000008be8d614>] rtnetlink_rcv+0x21/0x30 [<000000009ab2ca25>] netlink_unicast+0x52f/0x740 [<00000000e7d9ac96>] netlink_sendmsg+0x9c7/0xf50 [<000000005d1e2050>] sock_sendmsg+0xbe/0x120 [<00000000d51426bc>] ___sys_sendmsg+0x778/0x8f0 [<00000000b9d7b2cc>] __sys_sendmsg+0x112/0x270 unreferenced object 0xffff888227454308 (size 32): comm "bridge", pid 2532, jiffies 4295216998 (age 1188.882s) hex dump (first 32 bytes): 88 b2 20 2d 82 88 ff ff 88 b2 20 2d 82 88 ff ff .. -...... -.... 81 00 0a 00 01 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000f82d851d>] kmem_cache_alloc_trace+0x1be/0x330 [<0000000018050631>] vlan_vid_add+0x3e6/0x920 [<00000000218ebd5f>] __vlan_add+0x1be9/0x3a00 [<000000006eafa1ca>] nbp_vlan_add+0x8b3/0xd90 [<000000003535392c>] br_vlan_info+0x132/0x410 [<00000000aedaa9dc>] br_afspec+0x75c/0x870 [<00000000f5716133>] br_setlink+0x3dc/0x6d0 [<00000000aceca5e2>] rtnl_bridge_setlink+0x615/0xb30 [<00000000a2f2d23e>] rtnetlink_rcv_msg+0x3a3/0xa80 [<0000000064097e69>] netlink_rcv_skb+0x152/0x3c0 [<000000008be8d614>] rtnetlink_rcv+0x21/0x30 [<000000009ab2ca25>] netlink_unicast+0x52f/0x740 [<00000000e7d9ac96>] netlink_sendmsg+0x9c7/0xf50 [<000000005d1e2050>] sock_sendmsg+0xbe/0x120 [<00000000d51426bc>] ___sys_sendmsg+0x778/0x8f0 [<00000000b9d7b2cc>] __sys_sendmsg+0x112/0x270 Fixes: d70e42b22dd4 ("mlxsw: spectrum: Enable VxLAN enslavement to VLAN-aware bridges") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Cc: Roopa Prabhu Cc: Nikolay Aleksandrov Cc: bridge@lists.linux-foundation.org Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_private.h | 1 + net/bridge/br_vlan.c | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index d240b3e7919f..eabf8bf28a3f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -107,6 +107,7 @@ struct br_tunnel_info { /* private vlan flags */ enum { BR_VLFLAG_PER_PORT_STATS = BIT(0), + BR_VLFLAG_ADDED_BY_SWITCHDEV = BIT(1), }; /** diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 4a2f31157ef5..96abf8feb9dc 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -80,16 +80,18 @@ static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags) } static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, - u16 vid, u16 flags, struct netlink_ext_ack *extack) + struct net_bridge_vlan *v, u16 flags, + struct netlink_ext_ack *extack) { int err; /* Try switchdev op first. In case it is not supported, fallback to * 8021q add. */ - err = br_switchdev_port_vlan_add(dev, vid, flags, extack); + err = br_switchdev_port_vlan_add(dev, v->vid, flags, extack); if (err == -EOPNOTSUPP) - return vlan_vid_add(dev, br->vlan_proto, vid); + return vlan_vid_add(dev, br->vlan_proto, v->vid); + v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV; return err; } @@ -121,19 +123,17 @@ static void __vlan_del_list(struct net_bridge_vlan *v) } static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, - u16 vid) + const struct net_bridge_vlan *v) { int err; /* Try switchdev op first. In case it is not supported, fallback to * 8021q del. */ - err = br_switchdev_port_vlan_del(dev, vid); - if (err == -EOPNOTSUPP) { - vlan_vid_del(dev, br->vlan_proto, vid); - return 0; - } - return err; + err = br_switchdev_port_vlan_del(dev, v->vid); + if (!(v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)) + vlan_vid_del(dev, br->vlan_proto, v->vid); + return err == -EOPNOTSUPP ? 0 : err; } /* Returns a master vlan, if it didn't exist it gets created. In all cases a @@ -242,7 +242,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, * This ensures tagged traffic enters the bridge when * promiscuous mode is disabled by br_manage_promisc(). */ - err = __vlan_vid_add(dev, br, v->vid, flags, extack); + err = __vlan_vid_add(dev, br, v, flags, extack); if (err) goto out; @@ -305,7 +305,7 @@ out_fdb_insert: out_filt: if (p) { - __vlan_vid_del(dev, br, v->vid); + __vlan_vid_del(dev, br, v); if (masterv) { if (v->stats && masterv->stats != v->stats) free_percpu(v->stats); @@ -338,7 +338,7 @@ static int __vlan_del(struct net_bridge_vlan *v) __vlan_delete_pvid(vg, v->vid); if (p) { - err = __vlan_vid_del(p->dev, p->br, v->vid); + err = __vlan_vid_del(p->dev, p->br, v); if (err) goto out; } else { -- cgit v1.2.3 From 310529e663ed975d564cf029f878583e70c3b8a3 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Sun, 30 Dec 2018 17:27:43 +0100 Subject: netfilter: nf_tables: Fix for endless loop when dumping ruleset __nf_tables_dump_rules() stores the current idx value into cb->args[0] before returning to caller. With multiple chains present, cb->args[0] is therefore updated after each chain's rules have been traversed. This though causes the final nf_tables_dump_rules() run (which should return an skb->len of zero since no rules are left to dump) to continue dumping rules for each but the first chain. Fix this by moving the cb->args[0] update to nf_tables_dump_rules(). With no final action to be performed anymore in __nf_tables_dump_rules(), drop 'out_unfinished' jump label and 'rc' variable - instead return the appropriate value directly. Fixes: 241faeceb849c ("netfilter: nf_tables: Speed up selective rule dumps") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2b0a93300dd7..e3ddd8e95e58 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2304,7 +2304,6 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, struct net *net = sock_net(skb->sk); unsigned int s_idx = cb->args[0]; const struct nft_rule *rule; - int rc = 1; list_for_each_entry_rcu(rule, &chain->rules, list) { if (!nft_is_active(net, rule)) @@ -2321,16 +2320,13 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, NLM_F_MULTI | NLM_F_APPEND, table->family, table, chain, rule) < 0) - goto out_unfinished; + return 1; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: (*idx)++; } - rc = 0; -out_unfinished: - cb->args[0] = *idx; - return rc; + return 0; } static int nf_tables_dump_rules(struct sk_buff *skb, @@ -2382,6 +2378,8 @@ static int nf_tables_dump_rules(struct sk_buff *skb, } done: rcu_read_unlock(); + + cb->args[0] = idx; return skb->len; } -- cgit v1.2.3 From b91d9036883793122cf6575ca4dfbfbdd201a83d Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 4 Jan 2019 17:56:16 +0900 Subject: netfilter: nf_tables: fix leaking object reference count There is no code that decreases the reference count of stateful objects in error path of the nft_add_set_elem(). this causes a leak of reference count of stateful objects. Test commands: $nft add table ip filter $nft add counter ip filter c1 $nft add map ip filter m1 { type ipv4_addr : counter \;} $nft add element ip filter m1 { 1 : c1 } $nft add element ip filter m1 { 1 : c1 } $nft delete element ip filter m1 { 1 } $nft delete counter ip filter c1 Result: Error: Could not process rule: Device or resource busy delete counter ip filter c1 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ At the second 'nft add element ip filter m1 { 1 : c1 }', the reference count of the 'c1' is increased then it tries to insert into the 'm1'. but the 'm1' already has same element so it returns -EEXIST. But it doesn't decrease the reference count of the 'c1' in the error path. Due to a leak of the reference count of the 'c1', the 'c1' can't be removed by 'nft delete counter ip filter c1'. Fixes: 8aeff920dcc9 ("netfilter: nf_tables: add stateful object reference to set elements") Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e3ddd8e95e58..dcea979423bc 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4506,6 +4506,8 @@ err6: err5: kfree(trans); err4: + if (obj) + obj->use--; kfree(elem.priv); err3: if (nla[NFTA_SET_ELEM_DATA] != NULL) -- cgit v1.2.3 From 715849ab31f8e57bbad84cc6c38912aeba6beb21 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 8 Jan 2019 23:18:58 +0100 Subject: netfilter: nf_tables: selective rule dump needs table to be specified Table needs to be specified for selective rule dumps per chain. Fixes: 241faeceb849c ("netfilter: nf_tables: Speed up selective rule dumps") Reported-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index dcea979423bc..fb07f6cfc719 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2350,7 +2350,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0) continue; - if (ctx && ctx->chain) { + if (ctx && ctx->table && ctx->chain) { struct rhlist_head *list, *tmp; list = rhltable_lookup(&table->chains_ht, ctx->chain, -- cgit v1.2.3 From d972f3dce8d161e2142da0ab1ef25df00e2f21a9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 8 Jan 2019 23:27:06 +0000 Subject: packet: Do not leak dev refcounts on error exit 'dev' is non NULL when the addr_len check triggers so it must goto a label that does the dev_put otherwise dev will have a leaked refcount. This bug causes the ib_ipoib module to become unloadable when using systemd-network as it triggers this check on InfiniBand links. Fixes: 99137b7888f4 ("packet: validate address length") Reported-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index eedacdebcd4c..d0945253f43b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2628,7 +2628,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) addr = saddr->sll_halen ? saddr->sll_addr : NULL; dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); if (addr && dev && saddr->sll_halen < dev->addr_len) - goto out; + goto out_put; } err = -ENXIO; @@ -2828,7 +2828,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) addr = saddr->sll_halen ? saddr->sll_addr : NULL; dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex); if (addr && dev && saddr->sll_halen < dev->addr_len) - goto out; + goto out_unlock; } err = -ENXIO; -- cgit v1.2.3 From 355b00d1e14051c13aea48c1c5430c486fed2d7a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 4 Jan 2019 14:17:00 +0100 Subject: xfrm: policy: use hlist rcu variants on inexact insert, part 2 This function was modeled on the 'exact' insert one, which did not use the rcu variant either. When I fixed the 'exact' insert I forgot to propagate this to my development tree, so the inexact variant retained the bug. Fixes: 9cf545ebd591d ("xfrm: policy: store inexact policies in a tree ordered by destination address") Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 934492bad8e0..628b389af2ba 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -856,9 +856,9 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net, } if (newpos) - hlist_add_behind(&policy->bydst, newpos); + hlist_add_behind_rcu(&policy->bydst, newpos); else - hlist_add_head(&policy->bydst, &n->hhead); + hlist_add_head_rcu(&policy->bydst, &n->hhead); /* paranoia checks follow. * Check that the reinserted policy matches at least -- cgit v1.2.3 From 7a474c36586f4277f930ab7e6865c97e44dfc3bc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 4 Jan 2019 14:17:01 +0100 Subject: xfrm: policy: increment xfrm_hash_generation on hash rebuild Hash rebuild will re-set all the inexact entries, then re-insert them. Lookups that can occur in parallel will therefore not find any policies. This was safe when lookups were still guarded by rwlock. After rcu-ification, lookups check the hash_generation seqcount to detect when a hash resize takes place. Hash rebuild missed the needed increment. Hash resizes and hash rebuilds cannot occur in parallel (both acquire hash_resize_mutex), so just increment xfrm_hash_generation, like resize. Fixes: a7c44247f704e3 ("xfrm: policy: make xfrm_policy_lookup_bytype lockless") Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 628b389af2ba..d8fba27a4bfb 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1235,6 +1235,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); + write_seqcount_begin(&xfrm_policy_hash_generation); /* make sure that we can insert the indirect policies again before * we start with destructive action. @@ -1334,6 +1335,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) out_unlock: __xfrm_policy_inexact_flush(net); + write_seqcount_end(&xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); -- cgit v1.2.3 From 1548bc4e0512700cf757192c106b3a20ab639223 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 4 Jan 2019 14:17:02 +0100 Subject: xfrm: policy: delete inexact policies from inexact list on hash rebuild An xfrm hash rebuild has to reset the inexact policy list before the policies get re-inserted: A change of hash thresholds will result in policies to get moved from inexact tree to the policy hash table. If the thresholds are increased again later, they get moved from hash table to inexact tree. We must unlink all policies from the inexact tree before re-insertion. Otherwise 'migrate' may find policies that are in main hash table a second time, when it searches the inexact lists. Furthermore, re-insertion without deletion can cause elements ->next to point back to itself, causing soft lockups or double-frees. Reported-by: syzbot+9d971dd21eb26567036b@syzkaller.appspotmail.com Fixes: 9cf545ebd591da ("xfrm: policy: store inexact policies in a tree ordered by destination address") Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index d8fba27a4bfb..24dfd1e47cf0 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -680,16 +680,6 @@ static void xfrm_hash_resize(struct work_struct *work) mutex_unlock(&hash_resize_mutex); } -static void xfrm_hash_reset_inexact_table(struct net *net) -{ - struct xfrm_pol_inexact_bin *b; - - lockdep_assert_held(&net->xfrm.xfrm_policy_lock); - - list_for_each_entry(b, &net->xfrm.inexact_bins, inexact_bins) - INIT_HLIST_HEAD(&b->hhead); -} - /* Make sure *pol can be inserted into fastbin. * Useful to check that later insert requests will be sucessful * (provided xfrm_policy_lock is held throughout). @@ -1279,10 +1269,14 @@ static void xfrm_hash_rebuild(struct work_struct *work) } /* reset the bydst and inexact table in all directions */ - xfrm_hash_reset_inexact_table(net); - for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { - INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); + struct hlist_node *n; + + hlist_for_each_entry_safe(policy, n, + &net->xfrm.policy_inexact[dir], + bydst_inexact_list) + hlist_del_init(&policy->bydst_inexact_list); + hmask = net->xfrm.policy_bydst[dir].hmask; odst = net->xfrm.policy_bydst[dir].table; for (i = hmask; i >= 0; i--) @@ -1314,6 +1308,9 @@ static void xfrm_hash_rebuild(struct work_struct *work) newpos = NULL; chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); + + hlist_del_rcu(&policy->bydst); + if (!chain) { void *p = xfrm_policy_inexact_insert(policy, dir, 0); -- cgit v1.2.3 From 1d38900cb85d5d311dbd23c2c93294527b82cd2b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 4 Jan 2019 14:17:03 +0100 Subject: xfrm: policy: fix reinsertion on node merge "newpos" has wrong scope. It must be NULL on each iteration of the loop. Otherwise, when policy is to be inserted at the start, we would instead insert at point found by the previous loop-iteration instead. Also, we need to unlink the policy before we reinsert it to the new node, else we can get next-points-to-self loops. Because policies are only ordered by priority it is irrelevant which policy is "more recent" except when two policies have same priority. (the more recent one is placed after the older one). In these cases, we can use the ->pos id number to know which one is the 'older': the higher the id, the more recent the policy. So we only need to unlink all policies from the node that is about to be removed, and insert them to the replacement node. Fixes: 9cf545ebd591da ("xfrm: policy: store inexact policies in a tree ordered by destination address") Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 24dfd1e47cf0..e691683223ee 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -823,13 +823,13 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net, u16 family) { unsigned int matched_s, matched_d; - struct hlist_node *newpos = NULL; struct xfrm_policy *policy, *p; matched_s = 0; matched_d = 0; list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { + struct hlist_node *newpos = NULL; bool matches_s, matches_d; if (!policy->bydst_reinsert) @@ -839,7 +839,10 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net, policy->bydst_reinsert = false; hlist_for_each_entry(p, &n->hhead, bydst) { - if (policy->priority >= p->priority) + if (policy->priority > p->priority) + newpos = &p->bydst; + else if (policy->priority == p->priority && + policy->pos > p->pos) newpos = &p->bydst; else break; @@ -955,12 +958,11 @@ static void xfrm_policy_inexact_node_merge(struct net *net, family); } - hlist_for_each_entry(tmp, &v->hhead, bydst) - tmp->bydst_reinsert = true; - hlist_for_each_entry(tmp, &n->hhead, bydst) + hlist_for_each_entry(tmp, &v->hhead, bydst) { tmp->bydst_reinsert = true; + hlist_del_rcu(&tmp->bydst); + } - INIT_HLIST_HEAD(&n->hhead); xfrm_policy_inexact_list_reinsert(net, n, family); } -- cgit v1.2.3 From 12750abad517a991c4568969bc748db302ab52cd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 4 Jan 2019 14:17:05 +0100 Subject: xfrm: policy: fix infinite loop when merging src-nodes With very small change to test script we can trigger softlockup due to bogus assignment of 'p' (policy to be examined) on restart. Previously the two to-be-merged nodes had same address/prefixlength pair, so no erase/reinsert was necessary, we only had to append the list from node a to b. If prefix lengths are different, the node has to be deleted and re-inserted into the tree, with the updated prefix length. This was broken; due to bogus update to 'p' this loops forever. Add a 'restart' label and use that instead. While at it, don't perform the unneeded reinserts of the policies that are already sorted into the 'new' node. A previous patch in this series made xfrm_policy_inexact_list_reinsert() use the relative position indicator to sort policies according to age in case priorities are identical. Fixes: 6ac098b2a9d30 ("xfrm: policy: add 2nd-level saddr trees for inexact policies") Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 15 +++++++-------- tools/testing/selftests/net/xfrm_policy.sh | 4 ++-- 2 files changed, 9 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e691683223ee..8cfd75b62396 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -886,12 +886,13 @@ static void xfrm_policy_inexact_node_reinsert(struct net *net, struct rb_root *new, u16 family) { - struct rb_node **p, *parent = NULL; struct xfrm_pol_inexact_node *node; + struct rb_node **p, *parent; /* we should not have another subtree here */ WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root)); - +restart: + parent = NULL; p = &new->rb_node; while (*p) { u8 prefixlen; @@ -911,12 +912,11 @@ static void xfrm_policy_inexact_node_reinsert(struct net *net, } else { struct xfrm_policy *tmp; - hlist_for_each_entry(tmp, &node->hhead, bydst) - tmp->bydst_reinsert = true; - hlist_for_each_entry(tmp, &n->hhead, bydst) + hlist_for_each_entry(tmp, &n->hhead, bydst) { tmp->bydst_reinsert = true; + hlist_del_rcu(&tmp->bydst); + } - INIT_HLIST_HEAD(&node->hhead); xfrm_policy_inexact_list_reinsert(net, node, family); if (node->prefixlen == n->prefixlen) { @@ -928,8 +928,7 @@ static void xfrm_policy_inexact_node_reinsert(struct net *net, kfree_rcu(n, rcu); n = node; n->prefixlen = prefixlen; - *p = new->rb_node; - parent = NULL; + goto restart; } } diff --git a/tools/testing/selftests/net/xfrm_policy.sh b/tools/testing/selftests/net/xfrm_policy.sh index 8ce54600d4d1..71d7fdc513c1 100755 --- a/tools/testing/selftests/net/xfrm_policy.sh +++ b/tools/testing/selftests/net/xfrm_policy.sh @@ -78,8 +78,8 @@ do_overlap() # adds a new node in the 10.0.0.0/24 tree (dst node exists). ip -net $ns xfrm policy add src 10.2.0.0/24 dst 10.0.0.0/24 dir fwd priority 200 action block - # adds a 10.2.0.0/24 node, but for different dst. - ip -net $ns xfrm policy add src 10.2.0.0/24 dst 10.0.1.0/24 dir fwd priority 200 action block + # adds a 10.2.0.0/23 node, but for different dst. + ip -net $ns xfrm policy add src 10.2.0.0/23 dst 10.0.1.0/24 dir fwd priority 200 action block # dst now overlaps with the 10.0.1.0/24 ESP policy in fwd. # kernel must 'promote' existing one (10.0.0.0/24) to 10.0.0.0/23. -- cgit v1.2.3 From dd9ee3444014e8f28c0eefc9fffc9ac9c5248c12 Mon Sep 17 00:00:00 2001 From: Su Yanjun Date: Sun, 6 Jan 2019 21:31:20 -0500 Subject: vti4: Fix a ipip packet processing bug in 'IPCOMP' virtual tunnel Recently we run a network test over ipcomp virtual tunnel.We find that if a ipv4 packet needs fragment, then the peer can't receive it. We deep into the code and find that when packet need fragment the smaller fragment will be encapsulated by ipip not ipcomp. So when the ipip packet goes into xfrm, it's skb->dev is not properly set. The ipv4 reassembly code always set skb'dev to the last fragment's dev. After ipv4 defrag processing, when the kernel rp_filter parameter is set, the skb will be drop by -EXDEV error. This patch adds compatible support for the ipip process in ipcomp virtual tunnel. Signed-off-by: Su Yanjun Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index d7b43e700023..68a21bf75dd0 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -74,6 +74,33 @@ drop: return 0; } +static int vti_input_ipip(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) +{ + struct ip_tunnel *tunnel; + const struct iphdr *iph = ip_hdr(skb); + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->saddr, iph->daddr, 0); + if (tunnel) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; + + skb->dev = tunnel->dev; + + return xfrm_input(skb, nexthdr, spi, encap_type); + } + + return -EINVAL; +drop: + kfree_skb(skb); + return 0; +} + static int vti_rcv(struct sk_buff *skb) { XFRM_SPI_SKB_CB(skb)->family = AF_INET; @@ -82,6 +109,14 @@ static int vti_rcv(struct sk_buff *skb) return vti_input(skb, ip_hdr(skb)->protocol, 0, 0); } +static int vti_rcv_ipip(struct sk_buff *skb) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + return vti_input_ipip(skb, ip_hdr(skb)->protocol, ip_hdr(skb)->saddr, 0); +} + static int vti_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; @@ -435,6 +470,12 @@ static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { .priority = 100, }; +static struct xfrm_tunnel ipip_handler __read_mostly = { + .handler = vti_rcv_ipip, + .err_handler = vti4_err, + .priority = 0, +}; + static int __net_init vti_init_net(struct net *net) { int err; @@ -603,6 +644,13 @@ static int __init vti_init(void) if (err < 0) goto xfrm_proto_comp_failed; + msg = "ipip tunnel"; + err = xfrm4_tunnel_register(&ipip_handler, AF_INET); + if (err < 0) { + pr_info("%s: cant't register tunnel\n",__func__); + goto xfrm_tunnel_failed; + } + msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); if (err < 0) @@ -612,6 +660,8 @@ static int __init vti_init(void) rtnl_link_failed: xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); +xfrm_tunnel_failed: + xfrm4_tunnel_deregister(&ipip_handler, AF_INET); xfrm_proto_comp_failed: xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: -- cgit v1.2.3 From e7f45099442a380f8e087b6a8aadc36e887df1cc Mon Sep 17 00:00:00 2001 From: Santosh kumar pradhan Date: Wed, 9 Jan 2019 22:08:26 +0530 Subject: sunrpc: kernel BUG at kernel/cred.c:825! Init missing debug member magic with CRED_MAGIC. Signed-off-by: Santosh kumar pradhan Reported-by: Dave Jones Signed-off-by: Anna Schumaker --- net/sunrpc/auth.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 1ff9768f5456..f3023bbc0b7f 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -41,6 +41,9 @@ static unsigned long number_cred_unused; static struct cred machine_cred = { .usage = ATOMIC_INIT(1), +#ifdef CONFIG_DEBUG_CREDENTIALS + .magic = CRED_MAGIC, +#endif }; /* -- cgit v1.2.3 From a799aea0988ea0d1b1f263e996fdad2f6133c680 Mon Sep 17 00:00:00 2001 From: wenxu Date: Wed, 9 Jan 2019 10:40:11 +0800 Subject: netfilter: nft_flow_offload: Fix reverse route lookup Using the following example: client 1.1.1.7 ---> 2.2.2.7 which dnat to 10.0.0.7 server The first reply packet (ie. syn+ack) uses an incorrect destination address for the reverse route lookup since it uses: daddr = ct->tuplehash[!dir].tuple.dst.u3.ip; which is 2.2.2.7 in the scenario that is described above, while this should be: daddr = ct->tuplehash[dir].tuple.src.u3.ip; that is 10.0.0.7. Signed-off-by: wenxu Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_flow_offload.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 974525eb92df..ccdb8f5ababb 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -29,10 +29,10 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, memset(&fl, 0, sizeof(fl)); switch (nft_pf(pkt)) { case NFPROTO_IPV4: - fl.u.ip4.daddr = ct->tuplehash[!dir].tuple.dst.u3.ip; + fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; break; case NFPROTO_IPV6: - fl.u.ip6.daddr = ct->tuplehash[!dir].tuple.dst.u3.in6; + fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; break; } -- cgit v1.2.3 From 31aa6503a15ba00182ea6dbbf51afb63bf9e851d Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 8 Jan 2019 18:12:24 -0800 Subject: bpf: correctly set initial window on active Fast Open sender The existing BPF TCP initial congestion window (TCP_BPF_IW) does not to work on (active) Fast Open sender. This is because it changes the (initial) window only if data_segs_out is zero -- but data_segs_out is also incremented on SYN-data. This patch fixes the issue by proerly accounting for SYN-data additionally. Fixes: fc7478103c84 ("bpf: Adds support for setting initial cwnd") Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Acked-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 447dd1bad31f..2b3b436ef545 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4203,7 +4203,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, /* Only some options are supported */ switch (optname) { case TCP_BPF_IW: - if (val <= 0 || tp->data_segs_out > 0) + if (val <= 0 || tp->data_segs_out > tp->syn_data) ret = -EINVAL; else tp->snd_cwnd = val; -- cgit v1.2.3 From 35e6103861a3a970de6c84688c6e7a1f65b164ca Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 9 Jan 2019 14:37:34 +0100 Subject: xfrm: refine validation of template and selector families The check assumes that in transport mode, the first templates family must match the address family of the policy selector. Syzkaller managed to build a template using MODE_ROUTEOPTIMIZATION, with ipv4-in-ipv6 chain, leading to following splat: BUG: KASAN: stack-out-of-bounds in xfrm_state_find+0x1db/0x1854 Read of size 4 at addr ffff888063e57aa0 by task a.out/2050 xfrm_state_find+0x1db/0x1854 xfrm_tmpl_resolve+0x100/0x1d0 xfrm_resolve_and_create_bundle+0x108/0x1000 [..] Problem is that addresses point into flowi4 struct, but xfrm_state_find treats them as being ipv6 because it uses templ->encap_family is used (AF_INET6 in case of reproducer) rather than family (AF_INET). This patch inverts the logic: Enforce 'template family must match selector' EXCEPT for tunnel and BEET mode. In BEET and Tunnel mode, xfrm_tmpl_resolve_one will have remote/local address pointers changed to point at the addresses found in the template, rather than the flowi ones, so no oob read will occur. Reported-by: 3ntr0py1337@gmail.com Reported-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 277c1c46fe94..c6d26afcf89d 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1488,10 +1488,15 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) if (!ut[i].family) ut[i].family = family; - if ((ut[i].mode == XFRM_MODE_TRANSPORT) && - (ut[i].family != prev_family)) - return -EINVAL; - + switch (ut[i].mode) { + case XFRM_MODE_TUNNEL: + case XFRM_MODE_BEET: + break; + default: + if (ut[i].family != prev_family) + return -EINVAL; + break; + } if (ut[i].mode >= XFRM_MODE_MAX) return -EINVAL; -- cgit v1.2.3 From 4a06fa67c4da20148803525151845276cdb995c1 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 7 Jan 2019 16:47:33 -0500 Subject: ip: on queued skb use skb_header_pointer instead of pskb_may_pull Commit 2efd4fca703a ("ip: in cmsg IP(V6)_ORIGDSTADDR call pskb_may_pull") avoided a read beyond the end of the skb linear segment by calling pskb_may_pull. That function can trigger a BUG_ON in pskb_expand_head if the skb is shared, which it is when when peeking. It can also return ENOMEM. Avoid both by switching to safer skb_header_pointer. Fixes: 2efd4fca703a ("ip: in cmsg IP(V6)_ORIGDSTADDR call pskb_may_pull") Reported-by: syzbot Suggested-by: Eric Dumazet Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 12 +++++------- net/ipv6/datagram.c | 10 ++++------ 2 files changed, 9 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index fffcc130900e..82f341e84fae 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -148,19 +148,17 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { + __be16 _ports[2], *ports; struct sockaddr_in sin; - __be16 *ports; - int end; - - end = skb_transport_offset(skb) + 4; - if (end > 0 && !pskb_may_pull(skb, end)) - return; /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ - ports = (__be16 *)skb_transport_header(skb); + ports = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_ports), &_ports); + if (!ports) + return; sin.sin_family = AF_INET; sin.sin_addr.s_addr = ip_hdr(skb)->daddr; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index bde08aa549f3..c2262a7e2088 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -700,17 +700,15 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, } if (np->rxopt.bits.rxorigdstaddr) { struct sockaddr_in6 sin6; - __be16 *ports; - int end; + __be16 _ports[2], *ports; - end = skb_transport_offset(skb) + 4; - if (end <= 0 || pskb_may_pull(skb, end)) { + ports = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_ports), &_ports); + if (ports) { /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ - ports = (__be16 *)skb_transport_header(skb); - sin6.sin6_family = AF_INET6; sin6.sin6_addr = ipv6_hdr(skb)->daddr; sin6.sin6_port = ports[1]; -- cgit v1.2.3 From 85704cb8dcfd88d351bfc87faaeba1c8214f3177 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 8 Jan 2019 12:30:00 +0300 Subject: net/core/neighbour: tell kmemleak about hash tables This fixes false-positive kmemleak reports about leaked neighbour entries: unreferenced object 0xffff8885c6e4d0a8 (size 1024): comm "softirq", pid 0, jiffies 4294922664 (age 167640.804s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 20 2c f3 83 ff ff ff ff ........ ,...... 08 c0 ef 5f 84 88 ff ff 01 8c 7d 02 01 00 00 00 ..._......}..... backtrace: [<00000000748509fe>] ip6_finish_output2+0x887/0x1e40 [<0000000036d7a0d8>] ip6_output+0x1ba/0x600 [<0000000027ea7dba>] ip6_send_skb+0x92/0x2f0 [<00000000d6e2111d>] udp_v6_send_skb.isra.24+0x680/0x15e0 [<000000000668a8be>] udpv6_sendmsg+0x18c9/0x27a0 [<000000004bd5fa90>] sock_sendmsg+0xb3/0xf0 [<000000008227b29f>] ___sys_sendmsg+0x745/0x8f0 [<000000008698009d>] __sys_sendmsg+0xde/0x170 [<00000000889dacf1>] do_syscall_64+0x9b/0x400 [<0000000081cdb353>] entry_SYSCALL_64_after_hwframe+0x49/0xbe [<000000005767ed39>] 0xffffffffffffffff Signed-off-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- net/core/neighbour.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 763a7b08df67..3e27a779f288 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -18,6 +18,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -443,12 +444,14 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) return NULL; - if (size <= PAGE_SIZE) + if (size <= PAGE_SIZE) { buckets = kzalloc(size, GFP_ATOMIC); - else + } else { buckets = (struct neighbour __rcu **) __get_free_pages(GFP_ATOMIC | __GFP_ZERO, get_order(size)); + kmemleak_alloc(buckets, size, 0, GFP_ATOMIC); + } if (!buckets) { kfree(ret); return NULL; @@ -468,10 +471,12 @@ static void neigh_hash_free_rcu(struct rcu_head *head) size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *); struct neighbour __rcu **buckets = nht->hash_buckets; - if (size <= PAGE_SIZE) + if (size <= PAGE_SIZE) { kfree(buckets); - else + } else { + kmemleak_free(buckets); free_pages((unsigned long)buckets, get_order(size)); + } kfree(nht); } -- cgit v1.2.3 From 7d033c9f6a7fd3821af75620a0257db87c2b552a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Jan 2019 04:06:14 -0800 Subject: ipv6: fix kernel-infoleak in ipv6_local_error() This patch makes sure the flow label in the IPv6 header forged in ipv6_local_error() is initialized. BUG: KMSAN: kernel-infoleak in _copy_to_user+0x16b/0x1f0 lib/usercopy.c:32 CPU: 1 PID: 24675 Comm: syz-executor1 Not tainted 4.20.0-rc7+ #4 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x173/0x1d0 lib/dump_stack.c:113 kmsan_report+0x12e/0x2a0 mm/kmsan/kmsan.c:613 kmsan_internal_check_memory+0x455/0xb00 mm/kmsan/kmsan.c:675 kmsan_copy_to_user+0xab/0xc0 mm/kmsan/kmsan_hooks.c:601 _copy_to_user+0x16b/0x1f0 lib/usercopy.c:32 copy_to_user include/linux/uaccess.h:177 [inline] move_addr_to_user+0x2e9/0x4f0 net/socket.c:227 ___sys_recvmsg+0x5d7/0x1140 net/socket.c:2284 __sys_recvmsg net/socket.c:2327 [inline] __do_sys_recvmsg net/socket.c:2337 [inline] __se_sys_recvmsg+0x2fa/0x450 net/socket.c:2334 __x64_sys_recvmsg+0x4a/0x70 net/socket.c:2334 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 RIP: 0033:0x457ec9 Code: 6d b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 3b b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f8750c06c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002f RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000457ec9 RDX: 0000000000002000 RSI: 0000000020000400 RDI: 0000000000000005 RBP: 000000000073bf00 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f8750c076d4 R13: 00000000004c4a60 R14: 00000000004d8140 R15: 00000000ffffffff Uninit was stored to memory at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:204 [inline] kmsan_save_stack mm/kmsan/kmsan.c:219 [inline] kmsan_internal_chain_origin+0x134/0x230 mm/kmsan/kmsan.c:439 __msan_chain_origin+0x70/0xe0 mm/kmsan/kmsan_instr.c:200 ipv6_recv_error+0x1e3f/0x1eb0 net/ipv6/datagram.c:475 udpv6_recvmsg+0x398/0x2ab0 net/ipv6/udp.c:335 inet_recvmsg+0x4fb/0x600 net/ipv4/af_inet.c:830 sock_recvmsg_nosec net/socket.c:794 [inline] sock_recvmsg+0x1d1/0x230 net/socket.c:801 ___sys_recvmsg+0x4d5/0x1140 net/socket.c:2278 __sys_recvmsg net/socket.c:2327 [inline] __do_sys_recvmsg net/socket.c:2337 [inline] __se_sys_recvmsg+0x2fa/0x450 net/socket.c:2334 __x64_sys_recvmsg+0x4a/0x70 net/socket.c:2334 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:204 [inline] kmsan_internal_poison_shadow+0x92/0x150 mm/kmsan/kmsan.c:158 kmsan_kmalloc+0xa6/0x130 mm/kmsan/kmsan_hooks.c:176 kmsan_slab_alloc+0xe/0x10 mm/kmsan/kmsan_hooks.c:185 slab_post_alloc_hook mm/slab.h:446 [inline] slab_alloc_node mm/slub.c:2759 [inline] __kmalloc_node_track_caller+0xe18/0x1030 mm/slub.c:4383 __kmalloc_reserve net/core/skbuff.c:137 [inline] __alloc_skb+0x309/0xa20 net/core/skbuff.c:205 alloc_skb include/linux/skbuff.h:998 [inline] ipv6_local_error+0x1a7/0x9e0 net/ipv6/datagram.c:334 __ip6_append_data+0x129f/0x4fd0 net/ipv6/ip6_output.c:1311 ip6_make_skb+0x6cc/0xcf0 net/ipv6/ip6_output.c:1775 udpv6_sendmsg+0x3f8e/0x45d0 net/ipv6/udp.c:1384 inet_sendmsg+0x54a/0x720 net/ipv4/af_inet.c:798 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] __sys_sendto+0x8c4/0xac0 net/socket.c:1788 __do_sys_sendto net/socket.c:1800 [inline] __se_sys_sendto+0x107/0x130 net/socket.c:1796 __x64_sys_sendto+0x6e/0x90 net/socket.c:1796 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 Bytes 4-7 of 28 are uninitialized Memory access of size 28 starts at ffff8881937bfce0 Data copied to user address 0000000020000000 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv6/datagram.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index c2262a7e2088..ee4a4e54d016 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -341,6 +341,7 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) skb_reset_network_header(skb); iph = ipv6_hdr(skb); iph->daddr = fl6->daddr; + ip6_flow_hdr(iph, 0, 0); serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; -- cgit v1.2.3 From c5715b8fabfca0ef85903f8bad2189940ed41cc8 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 8 Jan 2019 18:14:28 -0800 Subject: tcp: change txhash on SYN-data timeout Previously upon SYN timeouts the sender recomputes the txhash to try a different path. However this does not apply on the initial timeout of SYN-data (active Fast Open). Therefore an active IPv6 Fast Open connection may incur one second RTO penalty to take on a new path after the second SYN retransmission uses a new flow label. This patch removes this undesirable behavior so Fast Open changes the flow label just like the regular connections. This also helps avoid falsely disabling Fast Open on the sender which triggers after two consecutive SYN timeouts on Fast Open. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index f87dbc78b6bc..71a29e9c0620 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -226,7 +226,7 @@ static int tcp_write_timeout(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) { dst_negative_advice(sk); - } else if (!tp->syn_data && !tp->syn_fastopen) { + } else { sk_rethink_txhash(sk); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; -- cgit v1.2.3 From e2c8d550a973bb34fc28bc8d0ec996f84562fb8a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 2 Jan 2019 19:14:31 -0800 Subject: netfilter: ebtables: account ebt_table_info to kmemcg The [ip,ip6,arp]_tables use x_tables_info internally and the underlying memory is already accounted to kmemcg. Do the same for ebtables. The syzbot, by using setsockopt(EBT_SO_SET_ENTRIES), was able to OOM the whole system from a restricted memcg, a potential DoS. By accounting the ebt_table_info, the memory used for ebt_table_info can be contained within the memcg of the allocating process. However the lifetime of ebt_table_info is independent of the allocating process and is tied to the network namespace. So, the oom-killer will not be able to relieve the memory pressure due to ebt_table_info memory. The memory for ebt_table_info is allocated through vmalloc. Currently vmalloc does not handle the oom-killed allocating process correctly and one large allocation can bypass memcg limit enforcement. So, with this patch, at least the small allocations will be contained. For large allocations, we need to fix vmalloc. Reported-by: syzbot+7713f3aa67be76b1552c@syzkaller.appspotmail.com Signed-off-by: Shakeel Butt Reviewed-by: Kirill Tkhai Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 491828713e0b..5e55cef0cec3 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1137,14 +1137,16 @@ static int do_replace(struct net *net, const void __user *user, tmp.name[sizeof(tmp.name) - 1] = 0; countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids; - newinfo = vmalloc(sizeof(*newinfo) + countersize); + newinfo = __vmalloc(sizeof(*newinfo) + countersize, GFP_KERNEL_ACCOUNT, + PAGE_KERNEL); if (!newinfo) return -ENOMEM; if (countersize) memset(newinfo->counters, 0, countersize); - newinfo->entries = vmalloc(tmp.entries_size); + newinfo->entries = __vmalloc(tmp.entries_size, GFP_KERNEL_ACCOUNT, + PAGE_KERNEL); if (!newinfo->entries) { ret = -ENOMEM; goto free_newinfo; -- cgit v1.2.3 From 10f4e765879e514e1ce7f52ed26603047af196e2 Mon Sep 17 00:00:00 2001 From: wenxu Date: Thu, 10 Jan 2019 14:51:35 +0800 Subject: netfilter: nft_flow_offload: fix interaction with vrf slave device In the forward chain, the iif is changed from slave device to master vrf device. Thus, flow offload does not find a match on the lower slave device. This patch uses the cached route, ie. dst->dev, to update the iif and oif fields in the flow entry. After this patch, the following example works fine: # ip addr add dev eth0 1.1.1.1/24 # ip addr add dev eth1 10.0.0.1/24 # ip link add user1 type vrf table 1 # ip l set user1 up # ip l set dev eth0 master user1 # ip l set dev eth1 master user1 # nft add table firewall # nft add flowtable f fb1 { hook ingress priority 0 \; devices = { eth0, eth1 } \; } # nft add chain f ftb-all {type filter hook forward priority 0 \; policy accept \; } # nft add rule f ftb-all ct zone 1 ip protocol tcp flow offload @fb1 # nft add rule f ftb-all ct zone 1 ip protocol udp flow offload @fb1 Signed-off-by: wenxu Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 - net/netfilter/nf_flow_table_core.c | 5 +++-- net/netfilter/nft_flow_offload.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 7d5cda7ce32a..3e370cb36263 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -84,7 +84,6 @@ struct flow_offload { struct nf_flow_route { struct { struct dst_entry *dst; - int ifindex; } tuple[FLOW_OFFLOAD_DIR_MAX]; }; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index fa0844e2a68d..c0c72ae9df42 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -28,6 +28,7 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, { struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple; + struct dst_entry *other_dst = route->tuple[!dir].dst; struct dst_entry *dst = route->tuple[dir].dst; ft->dir = dir; @@ -50,8 +51,8 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, ft->src_port = ctt->src.u.tcp.port; ft->dst_port = ctt->dst.u.tcp.port; - ft->iifidx = route->tuple[dir].ifindex; - ft->oifidx = route->tuple[!dir].ifindex; + ft->iifidx = other_dst->dev->ifindex; + ft->oifidx = dst->dev->ifindex; ft->dst_cache = dst; } diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index ccdb8f5ababb..188c6bbf4e16 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -30,9 +30,11 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, switch (nft_pf(pkt)) { case NFPROTO_IPV4: fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; + fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; break; case NFPROTO_IPV6: fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; + fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; break; } @@ -41,9 +43,7 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, return -ENOENT; route->tuple[dir].dst = this_dst; - route->tuple[dir].ifindex = nft_in(pkt)->ifindex; route->tuple[!dir].dst = other_dst; - route->tuple[!dir].ifindex = nft_out(pkt)->ifindex; return 0; } -- cgit v1.2.3 From 5b4cb650e569db2e6a09d2fa0ef8eb789a0ac5d8 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 9 Jan 2019 02:24:34 +0900 Subject: net: bpfilter: use cleanup callback to release umh_info Now, UMH process is killed, do_exit() calls the umh_info->cleanup callback to release members of the umh_info. This patch makes bpfilter_umh's cleanup routine to use the umh_info->cleanup callback. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/bpfilter.h | 11 ++++++++--- net/bpfilter/bpfilter_kern.c | 23 ++++++++++------------- net/ipv4/bpfilter/sockopt.c | 33 ++++++++++++++++++++++++++------- 3 files changed, 44 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h index f02cee0225d4..70ffeed280e9 100644 --- a/include/linux/bpfilter.h +++ b/include/linux/bpfilter.h @@ -3,13 +3,18 @@ #define _LINUX_BPFILTER_H #include +#include struct sock; int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen); int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen); -extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname, - char __user *optval, - unsigned int optlen, bool is_set); +struct bpfilter_umh_ops { + struct umh_info info; + int (*sockopt)(struct sock *sk, int optname, + char __user *optval, + unsigned int optlen, bool is_set); +}; +extern struct bpfilter_umh_ops bpfilter_ops; #endif diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index 7acfc83087d5..a68940b74c01 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -13,7 +13,6 @@ extern char bpfilter_umh_start; extern char bpfilter_umh_end; -static struct umh_info info; /* since ip_getsockopt() can run in parallel, serialize access to umh */ static DEFINE_MUTEX(bpfilter_lock); @@ -28,16 +27,13 @@ static void shutdown_umh(struct umh_info *info) force_sig(SIGKILL, tsk); put_task_struct(tsk); } - fput(info->pipe_to_umh); - fput(info->pipe_from_umh); - info->pid = 0; } static void __stop_umh(void) { if (IS_ENABLED(CONFIG_INET)) { - bpfilter_process_sockopt = NULL; - shutdown_umh(&info); + bpfilter_ops.sockopt = NULL; + shutdown_umh(&bpfilter_ops.info); } } @@ -64,9 +60,10 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, req.addr = (long __force __user)optval; req.len = optlen; mutex_lock(&bpfilter_lock); - if (!info.pid) + if (!bpfilter_ops.info.pid) goto out; - n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos); + n = __kernel_write(bpfilter_ops.info.pipe_to_umh, &req, sizeof(req), + &pos); if (n != sizeof(req)) { pr_err("write fail %zd\n", n); __stop_umh(); @@ -74,7 +71,8 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, goto out; } pos = 0; - n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos); + n = kernel_read(bpfilter_ops.info.pipe_from_umh, &reply, sizeof(reply), + &pos); if (n != sizeof(reply)) { pr_err("read fail %zd\n", n); __stop_umh(); @@ -92,13 +90,12 @@ static int __init load_umh(void) int err; /* fork usermode process */ - info.cmdline = "bpfilter_umh"; err = fork_usermode_blob(&bpfilter_umh_start, &bpfilter_umh_end - &bpfilter_umh_start, - &info); + &bpfilter_ops.info); if (err) return err; - pr_info("Loaded bpfilter_umh pid %d\n", info.pid); + pr_info("Loaded bpfilter_umh pid %d\n", bpfilter_ops.info.pid); /* health check that usermode process started correctly */ if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) { @@ -106,7 +103,7 @@ static int __init load_umh(void) return -EFAULT; } if (IS_ENABLED(CONFIG_INET)) - bpfilter_process_sockopt = &__bpfilter_process_sockopt; + bpfilter_ops.sockopt = &__bpfilter_process_sockopt; return 0; } diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 5e04ed25bc0e..c326cfbc0f62 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -1,28 +1,37 @@ // SPDX-License-Identifier: GPL-2.0 +#include +#include #include #include #include #include #include +#include +#include -int (*bpfilter_process_sockopt)(struct sock *sk, int optname, - char __user *optval, - unsigned int optlen, bool is_set); -EXPORT_SYMBOL_GPL(bpfilter_process_sockopt); +struct bpfilter_umh_ops bpfilter_ops; +EXPORT_SYMBOL_GPL(bpfilter_ops); + +static void bpfilter_umh_cleanup(struct umh_info *info) +{ + fput(info->pipe_to_umh); + fput(info->pipe_from_umh); + info->pid = 0; +} static int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set) { - if (!bpfilter_process_sockopt) { + if (!bpfilter_ops.sockopt) { int err = request_module("bpfilter"); if (err) return err; - if (!bpfilter_process_sockopt) + if (!bpfilter_ops.sockopt) return -ECHILD; } - return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set); + return bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set); } int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, @@ -41,3 +50,13 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, return bpfilter_mbox_request(sk, optname, optval, len, false); } + +static int __init bpfilter_sockopt_init(void) +{ + bpfilter_ops.info.cmdline = "bpfilter_umh"; + bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup; + + return 0; +} + +module_init(bpfilter_sockopt_init); -- cgit v1.2.3 From 61fbf5933d42b02f552123af5a87a06335a3b4db Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 9 Jan 2019 02:24:53 +0900 Subject: net: bpfilter: restart bpfilter_umh when error occurred The bpfilter_umh will be stopped via __stop_umh() when the bpfilter error occurred. The bpfilter_umh() couldn't start again because there is no restart routine. The section of the bpfilter_umh_{start/end} is no longer .init.rodata because these area should be reused in the restart routine. hence the section name is changed to .bpfilter_umh. The bpfilter_ops->start() is restart callback. it will be called when bpfilter_umh is stopped. The stop bit means bpfilter_umh is stopped. this bit is set by both start and stop routine. Before this patch, Test commands: $ iptables -vnL $ kill -9 $ iptables -vnL [ 480.045136] bpfilter: write fail -32 $ iptables -vnL All iptables commands will fail. After this patch, Test commands: $ iptables -vnL $ kill -9 $ iptables -vnL $ iptables -vnL Now, all iptables commands will work. Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/bpfilter.h | 2 ++ net/bpfilter/bpfilter_kern.c | 37 +++++++++++++++++++++++++++---------- net/bpfilter/bpfilter_umh_blob.S | 2 +- net/ipv4/bpfilter/sockopt.c | 11 ++++++++++- 4 files changed, 40 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h index 70ffeed280e9..8ebcbdd70bdc 100644 --- a/include/linux/bpfilter.h +++ b/include/linux/bpfilter.h @@ -15,6 +15,8 @@ struct bpfilter_umh_ops { int (*sockopt)(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set); + int (*start)(void); + bool stop; }; extern struct bpfilter_umh_ops bpfilter_ops; #endif diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index a68940b74c01..c0fcde910a7a 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -16,13 +16,14 @@ extern char bpfilter_umh_end; /* since ip_getsockopt() can run in parallel, serialize access to umh */ static DEFINE_MUTEX(bpfilter_lock); -static void shutdown_umh(struct umh_info *info) +static void shutdown_umh(void) { struct task_struct *tsk; - if (!info->pid) + if (bpfilter_ops.stop) return; - tsk = get_pid_task(find_vpid(info->pid), PIDTYPE_PID); + + tsk = get_pid_task(find_vpid(bpfilter_ops.info.pid), PIDTYPE_PID); if (tsk) { force_sig(SIGKILL, tsk); put_task_struct(tsk); @@ -31,10 +32,8 @@ static void shutdown_umh(struct umh_info *info) static void __stop_umh(void) { - if (IS_ENABLED(CONFIG_INET)) { - bpfilter_ops.sockopt = NULL; - shutdown_umh(&bpfilter_ops.info); - } + if (IS_ENABLED(CONFIG_INET)) + shutdown_umh(); } static void stop_umh(void) @@ -85,7 +84,7 @@ out: return ret; } -static int __init load_umh(void) +static int start_umh(void) { int err; @@ -95,6 +94,7 @@ static int __init load_umh(void) &bpfilter_ops.info); if (err) return err; + bpfilter_ops.stop = false; pr_info("Loaded bpfilter_umh pid %d\n", bpfilter_ops.info.pid); /* health check that usermode process started correctly */ @@ -102,14 +102,31 @@ static int __init load_umh(void) stop_umh(); return -EFAULT; } - if (IS_ENABLED(CONFIG_INET)) - bpfilter_ops.sockopt = &__bpfilter_process_sockopt; return 0; } +static int __init load_umh(void) +{ + int err; + + if (!bpfilter_ops.stop) + return -EFAULT; + err = start_umh(); + if (!err && IS_ENABLED(CONFIG_INET)) { + bpfilter_ops.sockopt = &__bpfilter_process_sockopt; + bpfilter_ops.start = &start_umh; + } + + return err; +} + static void __exit fini_umh(void) { + if (IS_ENABLED(CONFIG_INET)) { + bpfilter_ops.start = NULL; + bpfilter_ops.sockopt = NULL; + } stop_umh(); } module_init(load_umh); diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S index 40311d10d2f2..7f1c521dcc2f 100644 --- a/net/bpfilter/bpfilter_umh_blob.S +++ b/net/bpfilter/bpfilter_umh_blob.S @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ - .section .init.rodata, "a" + .section .bpfilter_umh, "a" .global bpfilter_umh_start bpfilter_umh_start: .incbin "net/bpfilter/bpfilter_umh" diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index c326cfbc0f62..de84ede4e765 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -14,6 +14,7 @@ EXPORT_SYMBOL_GPL(bpfilter_ops); static void bpfilter_umh_cleanup(struct umh_info *info) { + bpfilter_ops.stop = true; fput(info->pipe_to_umh); fput(info->pipe_from_umh); info->pid = 0; @@ -23,14 +24,21 @@ static int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set) { + int err; + if (!bpfilter_ops.sockopt) { - int err = request_module("bpfilter"); + err = request_module("bpfilter"); if (err) return err; if (!bpfilter_ops.sockopt) return -ECHILD; } + if (bpfilter_ops.stop) { + err = bpfilter_ops.start(); + if (err) + return err; + } return bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set); } @@ -53,6 +61,7 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, static int __init bpfilter_sockopt_init(void) { + bpfilter_ops.stop = true; bpfilter_ops.info.cmdline = "bpfilter_umh"; bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup; -- cgit v1.2.3 From 71a8508402b570127d6500c1ad456bbd33ccf187 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 9 Jan 2019 02:25:10 +0900 Subject: net: bpfilter: disallow to remove bpfilter module while being used The bpfilter.ko module can be removed while functions of the bpfilter.ko are executing. so panic can occurred. in order to protect that, locks can be used. a bpfilter_lock protects routines in the __bpfilter_process_sockopt() but it's not enough because __exit routine can be executed concurrently. Now, the bpfilter_umh can not run in parallel. So, the module do not removed while it's being used and it do not double-create UMH process. The members of the umh_info and the bpfilter_umh_ops are protected by the bpfilter_umh_ops.lock. test commands: while : do iptables -I FORWARD -m string --string ap --algo kmp & modprobe -rv bpfilter & done splat looks like: [ 298.623435] BUG: unable to handle kernel paging request at fffffbfff807440b [ 298.628512] #PF error: [normal kernel read fault] [ 298.633018] PGD 124327067 P4D 124327067 PUD 11c1a3067 PMD 119eb2067 PTE 0 [ 298.638859] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI [ 298.638859] CPU: 0 PID: 2997 Comm: iptables Not tainted 4.20.0+ #154 [ 298.638859] RIP: 0010:__mutex_lock+0x6b9/0x16a0 [ 298.638859] Code: c0 00 00 e8 89 82 ff ff 80 bd 8f fc ff ff 00 0f 85 d9 05 00 00 48 8b 85 80 fc ff ff 48 bf 00 00 00 00 00 fc ff df 48 c1 e8 03 <80> 3c 38 00 0f 85 1d 0e 00 00 48 8b 85 c8 fc ff ff 49 39 47 58 c6 [ 298.638859] RSP: 0018:ffff88810e7777a0 EFLAGS: 00010202 [ 298.638859] RAX: 1ffffffff807440b RBX: ffff888111bd4d80 RCX: 0000000000000000 [ 298.638859] RDX: 1ffff110235ff806 RSI: ffff888111bd5538 RDI: dffffc0000000000 [ 298.638859] RBP: ffff88810e777b30 R08: 0000000080000002 R09: 0000000000000000 [ 298.638859] R10: 0000000000000000 R11: 0000000000000000 R12: fffffbfff168a42c [ 298.638859] R13: ffff888111bd4d80 R14: ffff8881040e9a05 R15: ffffffffc03a2000 [ 298.638859] FS: 00007f39e3758700(0000) GS:ffff88811ae00000(0000) knlGS:0000000000000000 [ 298.638859] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 298.638859] CR2: fffffbfff807440b CR3: 000000011243e000 CR4: 00000000001006f0 [ 298.638859] Call Trace: [ 298.638859] ? mutex_lock_io_nested+0x1560/0x1560 [ 298.638859] ? kasan_kmalloc+0xa0/0xd0 [ 298.638859] ? kmem_cache_alloc+0x1c2/0x260 [ 298.638859] ? __alloc_file+0x92/0x3c0 [ 298.638859] ? alloc_empty_file+0x43/0x120 [ 298.638859] ? alloc_file_pseudo+0x220/0x330 [ 298.638859] ? sock_alloc_file+0x39/0x160 [ 298.638859] ? __sys_socket+0x113/0x1d0 [ 298.638859] ? __x64_sys_socket+0x6f/0xb0 [ 298.638859] ? do_syscall_64+0x138/0x560 [ 298.638859] ? entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 298.638859] ? __alloc_file+0x92/0x3c0 [ 298.638859] ? init_object+0x6b/0x80 [ 298.638859] ? cyc2ns_read_end+0x10/0x10 [ 298.638859] ? cyc2ns_read_end+0x10/0x10 [ 298.638859] ? hlock_class+0x140/0x140 [ 298.638859] ? sched_clock_local+0xd4/0x140 [ 298.638859] ? sched_clock_local+0xd4/0x140 [ 298.638859] ? check_flags.part.37+0x440/0x440 [ 298.638859] ? __lock_acquire+0x4f90/0x4f90 [ 298.638859] ? set_rq_offline.part.89+0x140/0x140 [ ... ] Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/bpfilter.h | 2 ++ net/bpfilter/bpfilter_kern.c | 28 +++++++++++----------------- net/ipv4/bpfilter/sockopt.c | 22 ++++++++++++++++------ 3 files changed, 29 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h index 8ebcbdd70bdc..d815622cd31e 100644 --- a/include/linux/bpfilter.h +++ b/include/linux/bpfilter.h @@ -12,6 +12,8 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen); struct bpfilter_umh_ops { struct umh_info info; + /* since ip_getsockopt() can run in parallel, serialize access to umh */ + struct mutex lock; int (*sockopt)(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set); diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index c0fcde910a7a..7ee4fea93637 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -13,9 +13,6 @@ extern char bpfilter_umh_start; extern char bpfilter_umh_end; -/* since ip_getsockopt() can run in parallel, serialize access to umh */ -static DEFINE_MUTEX(bpfilter_lock); - static void shutdown_umh(void) { struct task_struct *tsk; @@ -36,13 +33,6 @@ static void __stop_umh(void) shutdown_umh(); } -static void stop_umh(void) -{ - mutex_lock(&bpfilter_lock); - __stop_umh(); - mutex_unlock(&bpfilter_lock); -} - static int __bpfilter_process_sockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set) @@ -58,7 +48,6 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, req.cmd = optname; req.addr = (long __force __user)optval; req.len = optlen; - mutex_lock(&bpfilter_lock); if (!bpfilter_ops.info.pid) goto out; n = __kernel_write(bpfilter_ops.info.pipe_to_umh, &req, sizeof(req), @@ -80,7 +69,6 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, } ret = reply.status; out: - mutex_unlock(&bpfilter_lock); return ret; } @@ -99,7 +87,7 @@ static int start_umh(void) /* health check that usermode process started correctly */ if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) { - stop_umh(); + shutdown_umh(); return -EFAULT; } @@ -110,24 +98,30 @@ static int __init load_umh(void) { int err; - if (!bpfilter_ops.stop) - return -EFAULT; + mutex_lock(&bpfilter_ops.lock); + if (!bpfilter_ops.stop) { + err = -EFAULT; + goto out; + } err = start_umh(); if (!err && IS_ENABLED(CONFIG_INET)) { bpfilter_ops.sockopt = &__bpfilter_process_sockopt; bpfilter_ops.start = &start_umh; } - +out: + mutex_unlock(&bpfilter_ops.lock); return err; } static void __exit fini_umh(void) { + mutex_lock(&bpfilter_ops.lock); if (IS_ENABLED(CONFIG_INET)) { + shutdown_umh(); bpfilter_ops.start = NULL; bpfilter_ops.sockopt = NULL; } - stop_umh(); + mutex_unlock(&bpfilter_ops.lock); } module_init(load_umh); module_exit(fini_umh); diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index de84ede4e765..1e976bb93d99 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -14,10 +14,12 @@ EXPORT_SYMBOL_GPL(bpfilter_ops); static void bpfilter_umh_cleanup(struct umh_info *info) { + mutex_lock(&bpfilter_ops.lock); bpfilter_ops.stop = true; fput(info->pipe_to_umh); fput(info->pipe_from_umh); info->pid = 0; + mutex_unlock(&bpfilter_ops.lock); } static int bpfilter_mbox_request(struct sock *sk, int optname, @@ -25,21 +27,28 @@ static int bpfilter_mbox_request(struct sock *sk, int optname, unsigned int optlen, bool is_set) { int err; - + mutex_lock(&bpfilter_ops.lock); if (!bpfilter_ops.sockopt) { + mutex_unlock(&bpfilter_ops.lock); err = request_module("bpfilter"); + mutex_lock(&bpfilter_ops.lock); if (err) - return err; - if (!bpfilter_ops.sockopt) - return -ECHILD; + goto out; + if (!bpfilter_ops.sockopt) { + err = -ECHILD; + goto out; + } } if (bpfilter_ops.stop) { err = bpfilter_ops.start(); if (err) - return err; + goto out; } - return bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set); + err = bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set); +out: + mutex_unlock(&bpfilter_ops.lock); + return err; } int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, @@ -61,6 +70,7 @@ int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, static int __init bpfilter_sockopt_init(void) { + mutex_init(&bpfilter_ops.lock); bpfilter_ops.stop = true; bpfilter_ops.info.cmdline = "bpfilter_umh"; bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup; -- cgit v1.2.3 From 41d1c8839e5f8cb781cc635f12791decee8271b7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 8 Jan 2019 18:45:05 +0100 Subject: net: clear skb->tstamp in bridge forwarding path Matteo reported forwarding issues inside the linux bridge, if the enslaved interfaces use the fq qdisc. Similar to commit 8203e2d844d3 ("net: clear skb->tstamp in forwarding paths"), we need to clear the tstamp field in the bridge forwarding path. Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit time.") Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC") Reported-and-tested-by: Matteo Croce Signed-off-by: Paolo Abeni Acked-by: Nikolay Aleksandrov Acked-by: Roopa Prabhu Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 5372e2042adf..2cb8da465b98 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -65,6 +65,7 @@ EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit); int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + skb->tstamp = 0; return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, net, sk, skb, NULL, skb->dev, br_dev_queue_push_xmit); -- cgit v1.2.3 From 2314e879747e82896f51cce4488f6a00f3e1af7b Mon Sep 17 00:00:00 2001 From: Henry Yen Date: Mon, 14 Jan 2019 17:59:43 +0800 Subject: netfilter: nft_flow_offload: fix checking method of conntrack helper This patch uses nfct_help() to detect whether an established connection needs conntrack helper instead of using test_bit(IPS_HELPER_BIT, &ct->status). The reason is that IPS_HELPER_BIT is only set when using explicit CT target. However, in the case that a device enables conntrack helper via command "echo 1 > /proc/sys/net/netfilter/nf_conntrack_helper", the status of IPS_HELPER_BIT will not present any change, and consequently it loses the checking ability in the context. Signed-off-by: Henry Yen Reviewed-by: Ryder Lee Tested-by: John Crispin Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_flow_offload.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 188c6bbf4e16..6e6b9adf7d38 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -12,6 +12,7 @@ #include #include #include +#include struct nft_flow_offload { struct nft_flowtable *flowtable; @@ -66,6 +67,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, { struct nft_flow_offload *priv = nft_expr_priv(expr); struct nf_flowtable *flowtable = &priv->flowtable->data; + const struct nf_conn_help *help; enum ip_conntrack_info ctinfo; struct nf_flow_route route; struct flow_offload *flow; @@ -88,7 +90,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, goto out; } - if (test_bit(IPS_HELPER_BIT, &ct->status)) + help = nfct_help(ct); + if (help) goto out; if (ctinfo == IP_CT_NEW || -- cgit v1.2.3 From cc5b5d3565048ae57d14e5674a5fb085b2ab0193 Mon Sep 17 00:00:00 2001 From: Krzysztof Kazimierczak Date: Thu, 10 Jan 2019 20:29:02 +0100 Subject: xsk: Check if a queue exists during umem setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the xdp_umem_assign_dev() path, the xsk code does not check if a queue for which umem is to be created exists. It leads to a situation where umem is not assigned to any Tx/Rx queue of a netdevice, without notifying the stack about an error. This affects both XDP_SKB and XDP_DRV modes - in case of XDP_DRV_ZC, queue index is checked by the driver. This patch fixes xsk code, so that in both XDP_SKB and XDP_DRV mode of AF_XDP, an error is returned when requested queue index exceedes an existing maximum. Fixes: c9b47cc1fabca ("xsk: fix bug when trying to use both copy and zero-copy on one queue id") Reported-by: Jakub Spizewski Signed-off-by: Krzysztof Kazimierczak Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index a264cf2accd0..d4de871e7d4d 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -41,13 +41,20 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) * not know if the device has more tx queues than rx, or the opposite. * This might also change during run time. */ -static void xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem, - u16 queue_id) +static int xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem, + u16 queue_id) { + if (queue_id >= max_t(unsigned int, + dev->real_num_rx_queues, + dev->real_num_tx_queues)) + return -EINVAL; + if (queue_id < dev->real_num_rx_queues) dev->_rx[queue_id].umem = umem; if (queue_id < dev->real_num_tx_queues) dev->_tx[queue_id].umem = umem; + + return 0; } struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, @@ -88,7 +95,10 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, goto out_rtnl_unlock; } - xdp_reg_umem_at_qid(dev, umem, queue_id); + err = xdp_reg_umem_at_qid(dev, umem, queue_id); + if (err) + goto out_rtnl_unlock; + umem->dev = dev; umem->queue_id = queue_id; if (force_copy) -- cgit v1.2.3 From e66721f0436396f779291a29616858b76bfd9415 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 2 Jan 2019 17:53:10 -0500 Subject: SUNRPC: Ensure rq_bytes_sent is reset before request transmission When we resend a request, ensure that the 'rq_bytes_sent' is reset to zero. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 1 - net/sunrpc/xprt.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 71d9599b5816..0878c793ce7f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1739,7 +1739,6 @@ rpc_xdr_encode(struct rpc_task *task) xdr_buf_init(&req->rq_rcv_buf, req->rq_rbuffer, req->rq_rcvsize); - req->rq_bytes_sent = 0; p = rpc_encode_header(task); if (p == NULL) { diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 73547d17d3c6..9075ae150ae5 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1151,6 +1151,7 @@ xprt_request_enqueue_transmit(struct rpc_task *task) struct rpc_xprt *xprt = req->rq_xprt; if (xprt_request_need_enqueue_transmit(task, req)) { + req->rq_bytes_sent = 0; spin_lock(&xprt->queue_lock); /* * Requests that carry congestion control credits are added -- cgit v1.2.3 From 97b78ae96ba76f4ca2d8f5afee6a2e567ccb8f45 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 2 Jan 2019 17:53:13 -0500 Subject: SUNRPC: Ensure we respect the RPCSEC_GSS sequence number limit According to RFC2203, the RPCSEC_GSS sequence numbers are bounded to an upper limit of MAXSEQ = 0x80000000. Ensure that we handle that correctly. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/auth_gss/auth_gss.c | 12 +++++++++--- net/sunrpc/clnt.c | 19 ++++++++++++------- 2 files changed, 21 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index dc86713b32b6..1531b0219344 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1549,8 +1549,10 @@ gss_marshal(struct rpc_task *task, __be32 *p) cred_len = p++; spin_lock(&ctx->gc_seq_lock); - req->rq_seqno = ctx->gc_seq++; + req->rq_seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ; spin_unlock(&ctx->gc_seq_lock); + if (req->rq_seqno == MAXSEQ) + goto out_expired; *p++ = htonl((u32) RPC_GSS_VERSION); *p++ = htonl((u32) ctx->gc_proc); @@ -1572,14 +1574,18 @@ gss_marshal(struct rpc_task *task, __be32 *p) mic.data = (u8 *)(p + 1); maj_stat = gss_get_mic(ctx->gc_gss_ctx, &verf_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) { - clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); + goto out_expired; } else if (maj_stat != 0) { - printk("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat); + pr_warn("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat); + task->tk_status = -EIO; goto out_put_ctx; } p = xdr_encode_opaque(p, NULL, mic.len); gss_put_ctx(ctx); return p; +out_expired: + clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); + task->tk_status = -EKEYEXPIRED; out_put_ctx: gss_put_ctx(ctx); return NULL; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 0878c793ce7f..d7ec6132c046 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1741,11 +1741,8 @@ rpc_xdr_encode(struct rpc_task *task) req->rq_rcvsize); p = rpc_encode_header(task); - if (p == NULL) { - printk(KERN_INFO "RPC: couldn't encode RPC header, exit EIO\n"); - rpc_exit(task, -EIO); + if (p == NULL) return; - } encode = task->tk_msg.rpc_proc->p_encode; if (encode == NULL) @@ -1770,10 +1767,17 @@ call_encode(struct rpc_task *task) /* Did the encode result in an error condition? */ if (task->tk_status != 0) { /* Was the error nonfatal? */ - if (task->tk_status == -EAGAIN || task->tk_status == -ENOMEM) + switch (task->tk_status) { + case -EAGAIN: + case -ENOMEM: rpc_delay(task, HZ >> 4); - else + break; + case -EKEYEXPIRED: + task->tk_action = call_refresh; + break; + default: rpc_exit(task, task->tk_status); + } return; } @@ -2335,7 +2339,8 @@ rpc_encode_header(struct rpc_task *task) *p++ = htonl(clnt->cl_vers); /* program version */ *p++ = htonl(task->tk_msg.rpc_proc->p_proc); /* procedure */ p = rpcauth_marshcred(task, p); - req->rq_slen = xdr_adjust_iovec(&req->rq_svec[0], p); + if (p) + req->rq_slen = xdr_adjust_iovec(&req->rq_svec[0], p); return p; } -- cgit v1.2.3 From deaa5c96c2f7e8b934088a1e70a0fe8797bd1149 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 9 Jan 2019 10:04:57 -0500 Subject: SUNRPC: Address Kerberos performance/behavior regression When using Kerberos with v4.20, I've observed frequent connection loss on heavy workloads. I traced it down to the client underrunning the GSS sequence number window -- NFS servers are required to drop the RPC with the low sequence number, and also drop the connection to signal that an RPC was dropped. Bisected to commit 918f3c1fe83c ("SUNRPC: Improve latency for interactive tasks"). I've got a one-line workaround for this issue, which is easy to backport to v4.20 while a more permanent solution is being derived. Essentially, tk_owner-based sorting is disabled for RPCs that carry a GSS sequence number. Fixes: 918f3c1fe83c ("SUNRPC: Improve latency for interactive ... ") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 9075ae150ae5..f1ec2110efeb 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1178,7 +1178,7 @@ xprt_request_enqueue_transmit(struct rpc_task *task) INIT_LIST_HEAD(&req->rq_xmit2); goto out; } - } else { + } else if (!req->rq_seqno) { list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { if (pos->rq_task->tk_owner != task->tk_owner) continue; -- cgit v1.2.3 From ab5098fa25b91cb6fe0a0676f17abb64f2bbf024 Mon Sep 17 00:00:00 2001 From: Olivier Matz Date: Wed, 9 Jan 2019 10:57:21 +0100 Subject: ip6_gre: fix tunnel list corruption for x-netns In changelink ops, the ip6gre_net pointer is retrieved from dev_net(dev), which is wrong in case of x-netns. Thus, the tunnel is not unlinked from its current list and is relinked into another net namespace. This corrupts the tunnel lists and can later trigger a kernel oops. Fix this by retrieving the netns from device private area. Fixes: c8632fc30bb0 ("net: ip6_gre: Split up ip6gre_changelink()") Cc: Petr Machata Signed-off-by: Olivier Matz Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 09d0826742f8..f2543df50035 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -2025,9 +2025,9 @@ static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id); + struct ip6_tnl *t = netdev_priv(dev); + struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); struct __ip6_tnl_parm p; - struct ip6_tnl *t; t = ip6gre_changelink_common(dev, tb, data, &p, extack); if (IS_ERR(t)) -- cgit v1.2.3 From f97f4dd8b3bb9d0993d2491e0f22024c68109184 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 9 Jan 2019 09:57:39 +0000 Subject: net: ipv4: Fix memory leak in network namespace dismantle IPv4 routing tables are flushed in two cases: 1. In response to events in the netdev and inetaddr notification chains 2. When a network namespace is being dismantled In both cases only routes associated with a dead nexthop group are flushed. However, a nexthop group will only be marked as dead in case it is populated with actual nexthops using a nexthop device. This is not the case when the route in question is an error route (e.g., 'blackhole', 'unreachable'). Therefore, when a network namespace is being dismantled such routes are not flushed and leaked [1]. To reproduce: # ip netns add blue # ip -n blue route add unreachable 192.0.2.0/24 # ip netns del blue Fix this by not skipping error routes that are not marked with RTNH_F_DEAD when flushing the routing tables. To prevent the flushing of such routes in case #1, add a parameter to fib_table_flush() that indicates if the table is flushed as part of namespace dismantle or not. Note that this problem does not exist in IPv6 since error routes are associated with the loopback device. [1] unreferenced object 0xffff888066650338 (size 56): comm "ip", pid 1206, jiffies 4294786063 (age 26.235s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 b0 1c 62 61 80 88 ff ff ..........ba.... e8 8b a1 64 80 88 ff ff 00 07 00 08 fe 00 00 00 ...d............ backtrace: [<00000000856ed27d>] inet_rtm_newroute+0x129/0x220 [<00000000fcdfc00a>] rtnetlink_rcv_msg+0x397/0xa20 [<00000000cb85801a>] netlink_rcv_skb+0x132/0x380 [<00000000ebc991d2>] netlink_unicast+0x4c0/0x690 [<0000000014f62875>] netlink_sendmsg+0x929/0xe10 [<00000000bac9d967>] sock_sendmsg+0xc8/0x110 [<00000000223e6485>] ___sys_sendmsg+0x77a/0x8f0 [<000000002e94f880>] __sys_sendmsg+0xf7/0x250 [<00000000ccb1fa72>] do_syscall_64+0x14d/0x610 [<00000000ffbe3dae>] entry_SYSCALL_64_after_hwframe+0x49/0xbe [<000000003a8b605b>] 0xffffffffffffffff unreferenced object 0xffff888061621c88 (size 48): comm "ip", pid 1206, jiffies 4294786063 (age 26.235s) hex dump (first 32 bytes): 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk 6b 6b 6b 6b 6b 6b 6b 6b d8 8e 26 5f 80 88 ff ff kkkkkkkk..&_.... backtrace: [<00000000733609e3>] fib_table_insert+0x978/0x1500 [<00000000856ed27d>] inet_rtm_newroute+0x129/0x220 [<00000000fcdfc00a>] rtnetlink_rcv_msg+0x397/0xa20 [<00000000cb85801a>] netlink_rcv_skb+0x132/0x380 [<00000000ebc991d2>] netlink_unicast+0x4c0/0x690 [<0000000014f62875>] netlink_sendmsg+0x929/0xe10 [<00000000bac9d967>] sock_sendmsg+0xc8/0x110 [<00000000223e6485>] ___sys_sendmsg+0x77a/0x8f0 [<000000002e94f880>] __sys_sendmsg+0xf7/0x250 [<00000000ccb1fa72>] do_syscall_64+0x14d/0x610 [<00000000ffbe3dae>] entry_SYSCALL_64_after_hwframe+0x49/0xbe [<000000003a8b605b>] 0xffffffffffffffff Fixes: 8cced9eff1d4 ("[NETNS]: Enable routing configuration in non-initial namespace.") Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 +- net/ipv4/fib_frontend.c | 4 ++-- net/ipv4/fib_trie.c | 15 ++++++++++++--- 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index c5969762a8f4..9c8214d2116d 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -241,7 +241,7 @@ int fib_table_delete(struct net *, struct fib_table *, struct fib_config *, struct netlink_ext_ack *extack); int fib_table_dump(struct fib_table *table, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter); -int fib_table_flush(struct net *net, struct fib_table *table); +int fib_table_flush(struct net *net, struct fib_table *table, bool flush_all); struct fib_table *fib_trie_unmerge(struct fib_table *main_tb); void fib_table_flush_external(struct fib_table *table); void fib_free_table(struct fib_table *tb); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6df95be96311..fe4f6a624238 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -203,7 +203,7 @@ static void fib_flush(struct net *net) struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) - flushed += fib_table_flush(net, tb); + flushed += fib_table_flush(net, tb, false); } if (flushed) @@ -1463,7 +1463,7 @@ static void ip_fib_net_exit(struct net *net) hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { hlist_del(&tb->tb_hlist); - fib_table_flush(net, tb); + fib_table_flush(net, tb, true); fib_free_table(tb); } } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 237c9f72b265..a573e37e0615 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1856,7 +1856,7 @@ void fib_table_flush_external(struct fib_table *tb) } /* Caller must hold RTNL. */ -int fib_table_flush(struct net *net, struct fib_table *tb) +int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; @@ -1904,8 +1904,17 @@ int fib_table_flush(struct net *net, struct fib_table *tb) hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; - if (!fi || !(fi->fib_flags & RTNH_F_DEAD) || - tb->tb_id != fa->tb_id) { + if (!fi || tb->tb_id != fa->tb_id || + (!(fi->fib_flags & RTNH_F_DEAD) && + !fib_props[fa->fa_type].error)) { + slen = fa->fa_slen; + continue; + } + + /* Do not flush error routes if network namespace is + * not being dismantled + */ + if (!flush_all && fib_props[fa->fa_type].error) { slen = fa->fa_slen; continue; } -- cgit v1.2.3 From 80b3671e9377916bf2b02e56113fa7377ce5705a Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 10 Jan 2019 11:17:42 +0800 Subject: ip6_gre: update version related info when changing link We forgot to update ip6erspan version related info when changing link, which will cause setting new hwid failed. Reported-by: Jianlin Shi Fixes: 94d7d8f292870 ("ip6_gre: add erspan v2 support") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index f2543df50035..026f08735549 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1169,6 +1169,10 @@ static void ip6gre_tnl_copy_tnl_parm(struct ip6_tnl *t, t->parms.i_flags = p->i_flags; t->parms.o_flags = p->o_flags; t->parms.fwmark = p->fwmark; + t->parms.erspan_ver = p->erspan_ver; + t->parms.index = p->index; + t->parms.dir = p->dir; + t->parms.hwid = p->hwid; dst_cache_reset(&t->dst_cache); } -- cgit v1.2.3 From f6bab199315b70fd83fe3ee0947bc84c7a35f3d4 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Wed, 9 Jan 2019 17:09:42 +0100 Subject: sched: Avoid dereferencing skb pointer after child enqueue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parent qdiscs may dereference the pointer to the enqueued skb after enqueue. However, both CAKE and TBF call consume_skb() on the original skb when splitting GSO packets, leading to a potential use-after-free in the parent. Fix this by avoiding dereferencing the skb pointer after enqueueing to the child. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cbs.c | 3 ++- net/sched/sch_drr.c | 3 ++- net/sched/sch_dsmark.c | 3 ++- net/sched/sch_hfsc.c | 5 ++--- net/sched/sch_htb.c | 3 ++- net/sched/sch_prio.c | 3 ++- net/sched/sch_qfq.c | 16 +++++++++------- net/sched/sch_tbf.c | 3 ++- 8 files changed, 23 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index e689e11b6d0f..c6a502933fe7 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -88,13 +88,14 @@ static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { + unsigned int len = qdisc_pkt_len(skb); int err; err = child->ops->enqueue(skb, child, to_free); if (err != NET_XMIT_SUCCESS) return err; - qdisc_qstats_backlog_inc(sch, skb); + sch->qstats.backlog += len; sch->q.qlen++; return NET_XMIT_SUCCESS; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index cdebaed0f8cf..feaf47178653 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -350,6 +350,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + unsigned int len = qdisc_pkt_len(skb); struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; int err = 0; @@ -376,7 +377,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, cl->deficit = cl->quantum; } - qdisc_qstats_backlog_inc(sch, skb); + sch->qstats.backlog += len; sch->q.qlen++; return err; } diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index f6f480784bc6..42471464ded3 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -199,6 +199,7 @@ static struct tcf_block *dsmark_tcf_block(struct Qdisc *sch, unsigned long cl, static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + unsigned int len = qdisc_pkt_len(skb); struct dsmark_qdisc_data *p = qdisc_priv(sch); int err; @@ -271,7 +272,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - qdisc_qstats_backlog_inc(sch, skb); + sch->qstats.backlog += len; sch->q.qlen++; return NET_XMIT_SUCCESS; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index b18ec1f6de60..6bb8f73a8473 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1539,6 +1539,7 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) static int hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + unsigned int len = qdisc_pkt_len(skb); struct hfsc_class *cl; int uninitialized_var(err); @@ -1560,8 +1561,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) } if (cl->qdisc->q.qlen == 1) { - unsigned int len = qdisc_pkt_len(skb); - if (cl->cl_flags & HFSC_RSC) init_ed(cl, len); if (cl->cl_flags & HFSC_FSC) @@ -1576,7 +1575,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) } - qdisc_qstats_backlog_inc(sch, skb); + sch->qstats.backlog += len; sch->q.qlen++; return NET_XMIT_SUCCESS; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 58b449490757..30f9da7e1076 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -581,6 +581,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { int uninitialized_var(ret); + unsigned int len = qdisc_pkt_len(skb); struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = htb_classify(skb, sch, &ret); @@ -610,7 +611,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, htb_activate(q, cl); } - qdisc_qstats_backlog_inc(sch, skb); + sch->qstats.backlog += len; sch->q.qlen++; return NET_XMIT_SUCCESS; } diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index cdf68706e40f..847141cd900f 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -72,6 +72,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) static int prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + unsigned int len = qdisc_pkt_len(skb); struct Qdisc *qdisc; int ret; @@ -88,7 +89,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) ret = qdisc_enqueue(skb, qdisc, to_free); if (ret == NET_XMIT_SUCCESS) { - qdisc_qstats_backlog_inc(sch, skb); + sch->qstats.backlog += len; sch->q.qlen++; return NET_XMIT_SUCCESS; } diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index dc37c4ead439..8d5e55d5bed2 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1210,6 +1210,7 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + unsigned int len = qdisc_pkt_len(skb), gso_segs; struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; struct qfq_aggregate *agg; @@ -1224,17 +1225,17 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, } pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); - if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) { + if (unlikely(cl->agg->lmax < len)) { pr_debug("qfq: increasing maxpkt from %u to %u for class %u", - cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid); - err = qfq_change_agg(sch, cl, cl->agg->class_weight, - qdisc_pkt_len(skb)); + cl->agg->lmax, len, cl->common.classid); + err = qfq_change_agg(sch, cl, cl->agg->class_weight, len); if (err) { cl->qstats.drops++; return qdisc_drop(skb, sch, to_free); } } + gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); @@ -1245,8 +1246,9 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - bstats_update(&cl->bstats, skb); - qdisc_qstats_backlog_inc(sch, skb); + cl->bstats.bytes += len; + cl->bstats.packets += gso_segs; + sch->qstats.backlog += len; ++sch->q.qlen; agg = cl->agg; @@ -1254,7 +1256,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (cl->qdisc->q.qlen != 1) { if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) && list_first_entry(&agg->active, struct qfq_class, alist) - == cl && cl->deficit < qdisc_pkt_len(skb)) + == cl && cl->deficit < len) list_move_tail(&cl->alist, &agg->active); return err; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 942dcca09cf2..7f272a9070c5 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -185,6 +185,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct tbf_sched_data *q = qdisc_priv(sch); + unsigned int len = qdisc_pkt_len(skb); int ret; if (qdisc_pkt_len(skb) > q->max_size) { @@ -200,7 +201,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch, return ret; } - qdisc_qstats_backlog_inc(sch, skb); + sch->qstats.backlog += len; sch->q.qlen++; return NET_XMIT_SUCCESS; } -- cgit v1.2.3 From 37d9cf1a3ce35de3df6f7d209bfb1f50cf188cea Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Wed, 9 Jan 2019 17:09:43 +0100 Subject: sched: Fix detection of empty queues in child qdiscs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several qdiscs check on enqueue whether the packet was enqueued to a class with an empty queue, in which case the class is activated. This is done by checking if the qlen is exactly 1 after enqueue. However, if GSO splitting is enabled in the child qdisc, a single packet can result in a qlen longer than 1. This means the activation check fails, leading to a stalled queue. Fix this by checking if the queue is empty *before* enqueue, and running the activation logic if this was the case. Reported-by: Pete Heist Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_drr.c | 4 +++- net/sched/sch_hfsc.c | 4 +++- net/sched/sch_qfq.c | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index feaf47178653..09b800991065 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -354,6 +354,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; int err = 0; + bool first; cl = drr_classify(skb, sch, &err); if (cl == NULL) { @@ -363,6 +364,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } + first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { @@ -372,7 +374,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - if (cl->qdisc->q.qlen == 1) { + if (first) { list_add_tail(&cl->alist, &q->active); cl->deficit = cl->quantum; } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 6bb8f73a8473..24cc220a3218 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1542,6 +1542,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) unsigned int len = qdisc_pkt_len(skb); struct hfsc_class *cl; int uninitialized_var(err); + bool first; cl = hfsc_classify(skb, sch, &err); if (cl == NULL) { @@ -1551,6 +1552,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) return err; } + first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { @@ -1560,7 +1562,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) return err; } - if (cl->qdisc->q.qlen == 1) { + if (first) { if (cl->cl_flags & HFSC_RSC) init_ed(cl, len); if (cl->cl_flags & HFSC_FSC) diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 8d5e55d5bed2..29f5c4a24688 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1215,6 +1215,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct qfq_class *cl; struct qfq_aggregate *agg; int err = 0; + bool first; cl = qfq_classify(skb, sch, &err); if (cl == NULL) { @@ -1236,6 +1237,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, } gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1; + first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); @@ -1253,7 +1255,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, agg = cl->agg; /* if the queue was not empty, then done here */ - if (cl->qdisc->q.qlen != 1) { + if (!first) { if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) && list_first_entry(&agg->active, struct qfq_class, alist) == cl && cl->deficit < len) -- cgit v1.2.3 From 8c6c37fdc20ec9ffaa342f827a8e20afe736fb0c Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Wed, 9 Jan 2019 17:09:44 +0100 Subject: sch_cake: Correctly update parent qlen when splitting GSO packets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To ensure parent qdiscs have the same notion of the number of enqueued packets even after splitting a GSO packet, update the qdisc tree with the number of packets that was added due to the split. Reported-by: Pete Heist Tested-by: Pete Heist Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index b910cd5c56f7..73940293700d 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1667,7 +1667,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); - unsigned int slen = 0; + unsigned int slen = 0, numsegs = 0; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) @@ -1683,6 +1683,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, flow_queue_add(flow, segs); sch->q.qlen++; + numsegs++; slen += segs->len; q->buffer_used += segs->truesize; b->packets++; @@ -1696,7 +1697,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, sch->qstats.backlog += slen; q->avg_window_bytes += slen; - qdisc_tree_reduce_backlog(sch, 1, len); + qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); consume_skb(skb); } else { /* not splitting */ -- cgit v1.2.3 From a88289f4ddee4165d5f796bd99e09eec3133c16b Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 14 Jan 2019 17:22:24 +0800 Subject: tipc: fix uninit-value in in tipc_conn_rcv_sub syzbot reported: BUG: KMSAN: uninit-value in tipc_conn_rcv_sub+0x184/0x950 net/tipc/topsrv.c:373 CPU: 0 PID: 66 Comm: kworker/u4:4 Not tainted 4.17.0-rc3+ #88 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: tipc_rcv tipc_conn_recv_work Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:113 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683 tipc_conn_rcv_sub+0x184/0x950 net/tipc/topsrv.c:373 tipc_conn_rcv_from_sock net/tipc/topsrv.c:409 [inline] tipc_conn_recv_work+0x3cd/0x560 net/tipc/topsrv.c:424 process_one_work+0x12c6/0x1f60 kernel/workqueue.c:2145 worker_thread+0x113c/0x24f0 kernel/workqueue.c:2279 kthread+0x539/0x720 kernel/kthread.c:239 ret_from_fork+0x35/0x40 arch/x86/entry/entry_64.S:412 Local variable description: ----s.i@tipc_conn_recv_work Variable was created at: tipc_conn_recv_work+0x65/0x560 net/tipc/topsrv.c:419 process_one_work+0x12c6/0x1f60 kernel/workqueue.c:2145 In tipc_conn_rcv_from_sock(), it always supposes the length of message received from sock_recvmsg() is not smaller than the size of struct tipc_subscr. However, this assumption is false. Especially when the length of received message is shorter than struct tipc_subscr size, we will end up touching uninitialized fields in tipc_conn_rcv_sub(). Reported-by: syzbot+8951a3065ee7fd6d6e23@syzkaller.appspotmail.com Reported-by: syzbot+75e6e042c5bbf691fc82@syzkaller.appspotmail.com Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/topsrv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index efb16f69bd2c..a457c0fbbef1 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -398,7 +398,7 @@ static int tipc_conn_rcv_from_sock(struct tipc_conn *con) ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT); if (ret == -EWOULDBLOCK) return -EWOULDBLOCK; - if (ret > 0) { + if (ret == sizeof(s)) { read_lock_bh(&sk->sk_callback_lock); ret = tipc_conn_rcv_sub(srv, con, &s); read_unlock_bh(&sk->sk_callback_lock); -- cgit v1.2.3 From 8b66fee7f8ee18f9c51260e7a43ab37db5177a05 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 14 Jan 2019 17:22:25 +0800 Subject: tipc: fix uninit-value in tipc_nl_compat_link_reset_stats syzbot reports following splat: BUG: KMSAN: uninit-value in strlen+0x3b/0xa0 lib/string.c:486 CPU: 1 PID: 11057 Comm: syz-executor0 Not tainted 4.20.0-rc7+ #2 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x173/0x1d0 lib/dump_stack.c:113 kmsan_report+0x12e/0x2a0 mm/kmsan/kmsan.c:613 __msan_warning+0x82/0xf0 mm/kmsan/kmsan_instr.c:295 strlen+0x3b/0xa0 lib/string.c:486 nla_put_string include/net/netlink.h:1154 [inline] tipc_nl_compat_link_reset_stats+0x1f0/0x360 net/tipc/netlink_compat.c:760 __tipc_nl_compat_doit net/tipc/netlink_compat.c:311 [inline] tipc_nl_compat_doit+0x3aa/0xaf0 net/tipc/netlink_compat.c:344 tipc_nl_compat_handle net/tipc/netlink_compat.c:1107 [inline] tipc_nl_compat_recv+0x14d7/0x2760 net/tipc/netlink_compat.c:1210 genl_family_rcv_msg net/netlink/genetlink.c:601 [inline] genl_rcv_msg+0x185f/0x1a60 net/netlink/genetlink.c:626 netlink_rcv_skb+0x444/0x640 net/netlink/af_netlink.c:2477 genl_rcv+0x63/0x80 net/netlink/genetlink.c:637 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline] netlink_unicast+0xf40/0x1020 net/netlink/af_netlink.c:1336 netlink_sendmsg+0x127f/0x1300 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] ___sys_sendmsg+0xdb9/0x11b0 net/socket.c:2116 __sys_sendmsg net/socket.c:2154 [inline] __do_sys_sendmsg net/socket.c:2163 [inline] __se_sys_sendmsg+0x305/0x460 net/socket.c:2161 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2161 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 RIP: 0033:0x457ec9 Code: 6d b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 3b b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f2557338c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000457ec9 RDX: 0000000000000000 RSI: 00000000200001c0 RDI: 0000000000000003 RBP: 000000000073bf00 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f25573396d4 R13: 00000000004cb478 R14: 00000000004d86c8 R15: 00000000ffffffff Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:204 [inline] kmsan_internal_poison_shadow+0x92/0x150 mm/kmsan/kmsan.c:158 kmsan_kmalloc+0xa6/0x130 mm/kmsan/kmsan_hooks.c:176 kmsan_slab_alloc+0xe/0x10 mm/kmsan/kmsan_hooks.c:185 slab_post_alloc_hook mm/slab.h:446 [inline] slab_alloc_node mm/slub.c:2759 [inline] __kmalloc_node_track_caller+0xe18/0x1030 mm/slub.c:4383 __kmalloc_reserve net/core/skbuff.c:137 [inline] __alloc_skb+0x309/0xa20 net/core/skbuff.c:205 alloc_skb include/linux/skbuff.h:998 [inline] netlink_alloc_large_skb net/netlink/af_netlink.c:1182 [inline] netlink_sendmsg+0xb82/0x1300 net/netlink/af_netlink.c:1892 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] ___sys_sendmsg+0xdb9/0x11b0 net/socket.c:2116 __sys_sendmsg net/socket.c:2154 [inline] __do_sys_sendmsg net/socket.c:2163 [inline] __se_sys_sendmsg+0x305/0x460 net/socket.c:2161 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2161 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 The uninitialised access happened in tipc_nl_compat_link_reset_stats: nla_put_string(skb, TIPC_NLA_LINK_NAME, name) This is because name string is not validated before it's used. Reported-by: syzbot+e01d94b5a4c266be6e4c@syzkaller.appspotmail.com Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 77e4b2418f30..b2b115b22871 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -87,6 +87,11 @@ static int tipc_skb_tailroom(struct sk_buff *skb) return limit; } +static inline int TLV_GET_DATA_LEN(struct tlv_desc *tlv) +{ + return TLV_GET_LEN(tlv) - TLV_SPACE(0); +} + static int tipc_add_tlv(struct sk_buff *skb, u16 type, void *data, u16 len) { struct tlv_desc *tlv = (struct tlv_desc *)skb_tail_pointer(skb); @@ -166,6 +171,11 @@ static struct sk_buff *tipc_get_err_tlv(char *str) return buf; } +static inline bool string_is_valid(char *s, int len) +{ + return memchr(s, '\0', len) ? true : false; +} + static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, struct tipc_nl_compat_msg *msg, struct sk_buff *arg) @@ -750,6 +760,7 @@ static int tipc_nl_compat_link_reset_stats(struct tipc_nl_compat_cmd_doit *cmd, { char *name; struct nlattr *link; + int len; name = (char *)TLV_DATA(msg->req); @@ -757,6 +768,10 @@ static int tipc_nl_compat_link_reset_stats(struct tipc_nl_compat_cmd_doit *cmd, if (!link) return -EMSGSIZE; + len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_LINK_NAME); + if (!string_is_valid(name, len)) + return -EINVAL; + if (nla_put_string(skb, TIPC_NLA_LINK_NAME, name)) return -EMSGSIZE; -- cgit v1.2.3 From 0762216c0ad2a2fccd63890648eca491f2c83d9a Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 14 Jan 2019 17:22:26 +0800 Subject: tipc: fix uninit-value in tipc_nl_compat_bearer_enable syzbot reported: BUG: KMSAN: uninit-value in strlen+0x3b/0xa0 lib/string.c:484 CPU: 1 PID: 6371 Comm: syz-executor652 Not tainted 4.19.0-rc8+ #70 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x306/0x460 lib/dump_stack.c:113 kmsan_report+0x1a2/0x2e0 mm/kmsan/kmsan.c:917 __msan_warning+0x7c/0xe0 mm/kmsan/kmsan_instr.c:500 strlen+0x3b/0xa0 lib/string.c:484 nla_put_string include/net/netlink.h:1011 [inline] tipc_nl_compat_bearer_enable+0x238/0x7b0 net/tipc/netlink_compat.c:389 __tipc_nl_compat_doit net/tipc/netlink_compat.c:311 [inline] tipc_nl_compat_doit+0x39f/0xae0 net/tipc/netlink_compat.c:344 tipc_nl_compat_recv+0x147c/0x2760 net/tipc/netlink_compat.c:1107 genl_family_rcv_msg net/netlink/genetlink.c:601 [inline] genl_rcv_msg+0x185c/0x1a20 net/netlink/genetlink.c:626 netlink_rcv_skb+0x394/0x640 net/netlink/af_netlink.c:2454 genl_rcv+0x63/0x80 net/netlink/genetlink.c:637 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x166d/0x1720 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x1391/0x1420 net/netlink/af_netlink.c:1908 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] ___sys_sendmsg+0xe47/0x1200 net/socket.c:2116 __sys_sendmsg net/socket.c:2154 [inline] __do_sys_sendmsg net/socket.c:2163 [inline] __se_sys_sendmsg+0x307/0x460 net/socket.c:2161 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2161 do_syscall_64+0xbe/0x100 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 RIP: 0033:0x440179 Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 fb 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fffef7beee8 EFLAGS: 00000213 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000440179 RDX: 0000000000000000 RSI: 0000000020000100 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 0000000000000000 R09: 00000000004002c8 R10: 0000000000000000 R11: 0000000000000213 R12: 0000000000401a00 R13: 0000000000401a90 R14: 0000000000000000 R15: 0000000000000000 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:255 [inline] kmsan_internal_poison_shadow+0xc8/0x1d0 mm/kmsan/kmsan.c:180 kmsan_kmalloc+0xa4/0x120 mm/kmsan/kmsan_hooks.c:104 kmsan_slab_alloc+0x10/0x20 mm/kmsan/kmsan_hooks.c:113 slab_post_alloc_hook mm/slab.h:446 [inline] slab_alloc_node mm/slub.c:2727 [inline] __kmalloc_node_track_caller+0xb43/0x1400 mm/slub.c:4360 __kmalloc_reserve net/core/skbuff.c:138 [inline] __alloc_skb+0x422/0xe90 net/core/skbuff.c:206 alloc_skb include/linux/skbuff.h:996 [inline] netlink_alloc_large_skb net/netlink/af_netlink.c:1189 [inline] netlink_sendmsg+0xcaf/0x1420 net/netlink/af_netlink.c:1883 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] ___sys_sendmsg+0xe47/0x1200 net/socket.c:2116 __sys_sendmsg net/socket.c:2154 [inline] __do_sys_sendmsg net/socket.c:2163 [inline] __se_sys_sendmsg+0x307/0x460 net/socket.c:2161 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2161 do_syscall_64+0xbe/0x100 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 The root cause is that we don't validate whether bear name is a valid string in tipc_nl_compat_bearer_enable(). Meanwhile, we also fix the same issue in the following functions: tipc_nl_compat_bearer_disable() tipc_nl_compat_link_stat_dump() tipc_nl_compat_media_set() tipc_nl_compat_bearer_set() Reported-by: syzbot+b33d5cae0efd35dbfe77@syzkaller.appspotmail.com Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'net') diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index b2b115b22871..68a0b7308936 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -389,6 +389,7 @@ static int tipc_nl_compat_bearer_enable(struct tipc_nl_compat_cmd_doit *cmd, struct nlattr *prop; struct nlattr *bearer; struct tipc_bearer_config *b; + int len; b = (struct tipc_bearer_config *)TLV_DATA(msg->req); @@ -396,6 +397,10 @@ static int tipc_nl_compat_bearer_enable(struct tipc_nl_compat_cmd_doit *cmd, if (!bearer) return -EMSGSIZE; + len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_BEARER_NAME); + if (!string_is_valid(b->name, len)) + return -EINVAL; + if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, b->name)) return -EMSGSIZE; @@ -421,6 +426,7 @@ static int tipc_nl_compat_bearer_disable(struct tipc_nl_compat_cmd_doit *cmd, { char *name; struct nlattr *bearer; + int len; name = (char *)TLV_DATA(msg->req); @@ -428,6 +434,10 @@ static int tipc_nl_compat_bearer_disable(struct tipc_nl_compat_cmd_doit *cmd, if (!bearer) return -EMSGSIZE; + len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_BEARER_NAME); + if (!string_is_valid(name, len)) + return -EINVAL; + if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, name)) return -EMSGSIZE; @@ -488,6 +498,7 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg, struct nlattr *prop[TIPC_NLA_PROP_MAX + 1]; struct nlattr *stats[TIPC_NLA_STATS_MAX + 1]; int err; + int len; if (!attrs[TIPC_NLA_LINK]) return -EINVAL; @@ -514,6 +525,11 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg, return err; name = (char *)TLV_DATA(msg->req); + + len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_LINK_NAME); + if (!string_is_valid(name, len)) + return -EINVAL; + if (strcmp(name, nla_data(link[TIPC_NLA_LINK_NAME])) != 0) return 0; @@ -654,6 +670,7 @@ static int tipc_nl_compat_media_set(struct sk_buff *skb, struct nlattr *prop; struct nlattr *media; struct tipc_link_config *lc; + int len; lc = (struct tipc_link_config *)TLV_DATA(msg->req); @@ -661,6 +678,10 @@ static int tipc_nl_compat_media_set(struct sk_buff *skb, if (!media) return -EMSGSIZE; + len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_MEDIA_NAME); + if (!string_is_valid(lc->name, len)) + return -EINVAL; + if (nla_put_string(skb, TIPC_NLA_MEDIA_NAME, lc->name)) return -EMSGSIZE; @@ -681,6 +702,7 @@ static int tipc_nl_compat_bearer_set(struct sk_buff *skb, struct nlattr *prop; struct nlattr *bearer; struct tipc_link_config *lc; + int len; lc = (struct tipc_link_config *)TLV_DATA(msg->req); @@ -688,6 +710,10 @@ static int tipc_nl_compat_bearer_set(struct sk_buff *skb, if (!bearer) return -EMSGSIZE; + len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_MEDIA_NAME); + if (!string_is_valid(lc->name, len)) + return -EINVAL; + if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, lc->name)) return -EMSGSIZE; -- cgit v1.2.3 From edf5ff04a45750ac8ce2435974f001dc9cfbf055 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 14 Jan 2019 17:22:27 +0800 Subject: tipc: fix uninit-value in tipc_nl_compat_link_set syzbot reports following splat: BUG: KMSAN: uninit-value in strlen+0x3b/0xa0 lib/string.c:486 CPU: 1 PID: 9306 Comm: syz-executor172 Not tainted 4.20.0-rc7+ #2 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x173/0x1d0 lib/dump_stack.c:113 kmsan_report+0x12e/0x2a0 mm/kmsan/kmsan.c:613 __msan_warning+0x82/0xf0 mm/kmsan/kmsan_instr.c:313 strlen+0x3b/0xa0 lib/string.c:486 nla_put_string include/net/netlink.h:1154 [inline] __tipc_nl_compat_link_set net/tipc/netlink_compat.c:708 [inline] tipc_nl_compat_link_set+0x929/0x1220 net/tipc/netlink_compat.c:744 __tipc_nl_compat_doit net/tipc/netlink_compat.c:311 [inline] tipc_nl_compat_doit+0x3aa/0xaf0 net/tipc/netlink_compat.c:344 tipc_nl_compat_handle net/tipc/netlink_compat.c:1107 [inline] tipc_nl_compat_recv+0x14d7/0x2760 net/tipc/netlink_compat.c:1210 genl_family_rcv_msg net/netlink/genetlink.c:601 [inline] genl_rcv_msg+0x185f/0x1a60 net/netlink/genetlink.c:626 netlink_rcv_skb+0x444/0x640 net/netlink/af_netlink.c:2477 genl_rcv+0x63/0x80 net/netlink/genetlink.c:637 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline] netlink_unicast+0xf40/0x1020 net/netlink/af_netlink.c:1336 netlink_sendmsg+0x127f/0x1300 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] ___sys_sendmsg+0xdb9/0x11b0 net/socket.c:2116 __sys_sendmsg net/socket.c:2154 [inline] __do_sys_sendmsg net/socket.c:2163 [inline] __se_sys_sendmsg+0x305/0x460 net/socket.c:2161 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2161 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 The uninitialised access happened in nla_put_string(skb, TIPC_NLA_LINK_NAME, lc->name) This is because lc->name string is not validated before it's used. Reported-by: syzbot+d78b8a29241a195aefb8@syzkaller.appspotmail.com Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 68a0b7308936..89e6ae3b3c33 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -762,9 +762,14 @@ static int tipc_nl_compat_link_set(struct tipc_nl_compat_cmd_doit *cmd, struct tipc_link_config *lc; struct tipc_bearer *bearer; struct tipc_media *media; + int len; lc = (struct tipc_link_config *)TLV_DATA(msg->req); + len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_LINK_NAME); + if (!string_is_valid(lc->name, len)) + return -EINVAL; + media = tipc_media_find(lc->name); if (media) { cmd->doit = &__tipc_nl_media_set; -- cgit v1.2.3 From 974cb0e3e7c963ced06c4e32c5b2884173fa5e01 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 14 Jan 2019 17:22:28 +0800 Subject: tipc: fix uninit-value in tipc_nl_compat_name_table_dump syzbot reported: BUG: KMSAN: uninit-value in __arch_swab32 arch/x86/include/uapi/asm/swab.h:10 [inline] BUG: KMSAN: uninit-value in __fswab32 include/uapi/linux/swab.h:59 [inline] BUG: KMSAN: uninit-value in tipc_nl_compat_name_table_dump+0x4a8/0xba0 net/tipc/netlink_compat.c:826 CPU: 0 PID: 6290 Comm: syz-executor848 Not tainted 4.19.0-rc8+ #70 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x306/0x460 lib/dump_stack.c:113 kmsan_report+0x1a2/0x2e0 mm/kmsan/kmsan.c:917 __msan_warning+0x7c/0xe0 mm/kmsan/kmsan_instr.c:500 __arch_swab32 arch/x86/include/uapi/asm/swab.h:10 [inline] __fswab32 include/uapi/linux/swab.h:59 [inline] tipc_nl_compat_name_table_dump+0x4a8/0xba0 net/tipc/netlink_compat.c:826 __tipc_nl_compat_dumpit+0x59e/0xdb0 net/tipc/netlink_compat.c:205 tipc_nl_compat_dumpit+0x63a/0x820 net/tipc/netlink_compat.c:270 tipc_nl_compat_handle net/tipc/netlink_compat.c:1151 [inline] tipc_nl_compat_recv+0x1402/0x2760 net/tipc/netlink_compat.c:1210 genl_family_rcv_msg net/netlink/genetlink.c:601 [inline] genl_rcv_msg+0x185c/0x1a20 net/netlink/genetlink.c:626 netlink_rcv_skb+0x394/0x640 net/netlink/af_netlink.c:2454 genl_rcv+0x63/0x80 net/netlink/genetlink.c:637 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x166d/0x1720 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x1391/0x1420 net/netlink/af_netlink.c:1908 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] ___sys_sendmsg+0xe47/0x1200 net/socket.c:2116 __sys_sendmsg net/socket.c:2154 [inline] __do_sys_sendmsg net/socket.c:2163 [inline] __se_sys_sendmsg+0x307/0x460 net/socket.c:2161 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2161 do_syscall_64+0xbe/0x100 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 RIP: 0033:0x440179 Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 fb 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007ffecec49318 EFLAGS: 00000213 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000440179 RDX: 0000000000000000 RSI: 0000000020000100 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 0000000000000000 R09: 00000000004002c8 R10: 0000000000000000 R11: 0000000000000213 R12: 0000000000401a00 R13: 0000000000401a90 R14: 0000000000000000 R15: 0000000000000000 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:255 [inline] kmsan_internal_poison_shadow+0xc8/0x1d0 mm/kmsan/kmsan.c:180 kmsan_kmalloc+0xa4/0x120 mm/kmsan/kmsan_hooks.c:104 kmsan_slab_alloc+0x10/0x20 mm/kmsan/kmsan_hooks.c:113 slab_post_alloc_hook mm/slab.h:446 [inline] slab_alloc_node mm/slub.c:2727 [inline] __kmalloc_node_track_caller+0xb43/0x1400 mm/slub.c:4360 __kmalloc_reserve net/core/skbuff.c:138 [inline] __alloc_skb+0x422/0xe90 net/core/skbuff.c:206 alloc_skb include/linux/skbuff.h:996 [inline] netlink_alloc_large_skb net/netlink/af_netlink.c:1189 [inline] netlink_sendmsg+0xcaf/0x1420 net/netlink/af_netlink.c:1883 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] ___sys_sendmsg+0xe47/0x1200 net/socket.c:2116 __sys_sendmsg net/socket.c:2154 [inline] __do_sys_sendmsg net/socket.c:2163 [inline] __se_sys_sendmsg+0x307/0x460 net/socket.c:2161 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2161 do_syscall_64+0xbe/0x100 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 We cannot take for granted the thing that the length of data contained in TLV is longer than the size of struct tipc_name_table_query in tipc_nl_compat_name_table_dump(). Reported-by: syzbot+06e771a754829716a327@syzkaller.appspotmail.com Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 89e6ae3b3c33..b90786ca1d21 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -824,6 +824,8 @@ static int tipc_nl_compat_name_table_dump_header(struct tipc_nl_compat_msg *msg) }; ntq = (struct tipc_name_table_query *)TLV_DATA(msg->req); + if (TLV_GET_DATA_LEN(msg->req) < sizeof(struct tipc_name_table_query)) + return -EINVAL; depth = ntohl(ntq->depth); -- cgit v1.2.3 From 2753ca5d9009c180dbfd4c802c80983b4b6108d1 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 14 Jan 2019 17:22:29 +0800 Subject: tipc: fix uninit-value in tipc_nl_compat_doit BUG: KMSAN: uninit-value in tipc_nl_compat_doit+0x404/0xa10 net/tipc/netlink_compat.c:335 CPU: 0 PID: 4514 Comm: syz-executor485 Not tainted 4.16.0+ #87 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683 tipc_nl_compat_doit+0x404/0xa10 net/tipc/netlink_compat.c:335 tipc_nl_compat_recv+0x164b/0x2700 net/tipc/netlink_compat.c:1153 genl_family_rcv_msg net/netlink/genetlink.c:599 [inline] genl_rcv_msg+0x1686/0x1810 net/netlink/genetlink.c:624 netlink_rcv_skb+0x378/0x600 net/netlink/af_netlink.c:2447 genl_rcv+0x63/0x80 net/netlink/genetlink.c:635 netlink_unicast_kernel net/netlink/af_netlink.c:1311 [inline] netlink_unicast+0x166b/0x1740 net/netlink/af_netlink.c:1337 netlink_sendmsg+0x1048/0x1310 net/netlink/af_netlink.c:1900 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg net/socket.c:640 [inline] ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046 __sys_sendmsg net/socket.c:2080 [inline] SYSC_sendmsg+0x2a3/0x3d0 net/socket.c:2091 SyS_sendmsg+0x54/0x80 net/socket.c:2087 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x43fda9 RSP: 002b:00007ffd0c184ba8 EFLAGS: 00000213 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043fda9 RDX: 0000000000000000 RSI: 0000000020023000 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 00000000004002c8 R09: 00000000004002c8 R10: 00000000004002c8 R11: 0000000000000213 R12: 00000000004016d0 R13: 0000000000401760 R14: 0000000000000000 R15: 0000000000000000 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline] kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188 kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314 kmsan_slab_alloc+0x11/0x20 mm/kmsan/kmsan.c:321 slab_post_alloc_hook mm/slab.h:445 [inline] slab_alloc_node mm/slub.c:2737 [inline] __kmalloc_node_track_caller+0xaed/0x11c0 mm/slub.c:4369 __kmalloc_reserve net/core/skbuff.c:138 [inline] __alloc_skb+0x2cf/0x9f0 net/core/skbuff.c:206 alloc_skb include/linux/skbuff.h:984 [inline] netlink_alloc_large_skb net/netlink/af_netlink.c:1183 [inline] netlink_sendmsg+0x9a6/0x1310 net/netlink/af_netlink.c:1875 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg net/socket.c:640 [inline] ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046 __sys_sendmsg net/socket.c:2080 [inline] SYSC_sendmsg+0x2a3/0x3d0 net/socket.c:2091 SyS_sendmsg+0x54/0x80 net/socket.c:2087 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 In tipc_nl_compat_recv(), when the len variable returned by nlmsg_attrlen() is 0, the message is still treated as a valid one, which is obviously unresonable. When len is zero, it means the message not only doesn't contain any valid TLV payload, but also TLV header is not included. Under this stituation, tlv_type field in TLV header is still accessed in tipc_nl_compat_dumpit() or tipc_nl_compat_doit(), but the field space is obviously illegal. Of course, it is not initialized. Reported-by: syzbot+bca0dc46634781f08b38@syzkaller.appspotmail.com Reported-by: syzbot+6bdb590321a7ae40c1a6@syzkaller.appspotmail.com Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index b90786ca1d21..4ad3586da8f0 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1256,7 +1256,7 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) } len = nlmsg_attrlen(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN); - if (len && !TLV_OK(msg.req, len)) { + if (!len || !TLV_OK(msg.req, len)) { msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_SUPPORTED); err = -EOPNOTSUPP; goto send; -- cgit v1.2.3 From e122d845a01ece2ddd28b2f125ef2db66b8b627a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 10 Jan 2019 16:59:13 +0000 Subject: Revert "rxrpc: Allow failed client calls to be retried" The changes introduced to allow rxrpc calls to be retried creates an issue when it comes to refcounting afs_call structs. The problem is that when rxrpc_send_data() queues the last packet for an asynchronous call, the following sequence can occur: (1) The notify_end_tx callback is invoked which causes the state in the afs_call to be changed from AFS_CALL_CL_REQUESTING or AFS_CALL_SV_REPLYING. (2) afs_deliver_to_call() can then process event notifications from rxrpc on the async_work queue. (3) Delivery of events, such as an abort from the server, can cause the afs_call state to be changed to AFS_CALL_COMPLETE on async_work. (4) For an asynchronous call, afs_process_async_call() notes that the call is complete and tried to clean up all the refs on async_work. (5) rxrpc_send_data() might return the amount of data transferred (success) or an error - which could in turn reflect a local error or a received error. Synchronising the clean up after rxrpc_kernel_send_data() returns an error with the asynchronous cleanup is then tricky to get right. Mostly revert commit c038a58ccfd6704d4d7d60ed3d6a0fca13cf13a4. The two API functions the original commit added aren't currently used. This makes rxrpc_kernel_send_data() always return successfully if it queued the data it was given. Note that this doesn't affect synchronous calls since their Rx notification function merely pokes a wait queue and does not refcounting. The asynchronous call notification function *has* to do refcounting and pass a ref over the work item to avoid the need to sync the workqueue in call cleanup. Signed-off-by: David Howells Signed-off-by: David S. Miller --- Documentation/networking/rxrpc.txt | 45 ------------------ include/net/af_rxrpc.h | 16 ------- net/rxrpc/af_rxrpc.c | 70 --------------------------- net/rxrpc/ar-internal.h | 19 +++++--- net/rxrpc/call_object.c | 97 -------------------------------------- net/rxrpc/conn_client.c | 5 +- net/rxrpc/sendmsg.c | 24 +++++----- 7 files changed, 24 insertions(+), 252 deletions(-) (limited to 'net') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index c9d052e0cf51..2df5894353d6 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -1000,51 +1000,6 @@ The kernel interface functions are as follows: size should be set when the call is begun. tx_total_len may not be less than zero. - (*) Check to see the completion state of a call so that the caller can assess - whether it needs to be retried. - - enum rxrpc_call_completion { - RXRPC_CALL_SUCCEEDED, - RXRPC_CALL_REMOTELY_ABORTED, - RXRPC_CALL_LOCALLY_ABORTED, - RXRPC_CALL_LOCAL_ERROR, - RXRPC_CALL_NETWORK_ERROR, - }; - - int rxrpc_kernel_check_call(struct socket *sock, struct rxrpc_call *call, - enum rxrpc_call_completion *_compl, - u32 *_abort_code); - - On return, -EINPROGRESS will be returned if the call is still ongoing; if - it is finished, *_compl will be set to indicate the manner of completion, - *_abort_code will be set to any abort code that occurred. 0 will be - returned on a successful completion, -ECONNABORTED will be returned if the - client failed due to a remote abort and anything else will return an - appropriate error code. - - The caller should look at this information to decide if it's worth - retrying the call. - - (*) Retry a client call. - - int rxrpc_kernel_retry_call(struct socket *sock, - struct rxrpc_call *call, - struct sockaddr_rxrpc *srx, - struct key *key); - - This attempts to partially reinitialise a call and submit it again while - reusing the original call's Tx queue to avoid the need to repackage and - re-encrypt the data to be sent. call indicates the call to retry, srx the - new address to send it to and key the encryption key to use for signing or - encrypting the packets. - - For this to work, the first Tx data packet must still be in the transmit - queue, and currently this is only permitted for local and network errors - and the call must not have been aborted. Any partially constructed Tx - packet is left as is and can continue being filled afterwards. - - It returns 0 if the call was requeued and an error otherwise. - (*) Get call RTT. u64 rxrpc_kernel_get_rtt(struct socket *sock, struct rxrpc_call *call); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 1adefe42c0a6..2bfb87eb98ce 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -20,18 +20,6 @@ struct sock; struct socket; struct rxrpc_call; -/* - * Call completion condition (state == RXRPC_CALL_COMPLETE). - */ -enum rxrpc_call_completion { - RXRPC_CALL_SUCCEEDED, /* - Normal termination */ - RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ - RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ - RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ - RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ - NR__RXRPC_CALL_COMPLETIONS -}; - /* * Debug ID counter for tracing. */ @@ -73,10 +61,6 @@ int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, rxrpc_user_attach_call_t, unsigned long, gfp_t, unsigned int); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); -int rxrpc_kernel_retry_call(struct socket *, struct rxrpc_call *, - struct sockaddr_rxrpc *, struct key *); -int rxrpc_kernel_check_call(struct socket *, struct rxrpc_call *, - enum rxrpc_call_completion *, u32 *); u32 rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); void rxrpc_kernel_probe_life(struct socket *, struct rxrpc_call *); u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index a2522f9d71e2..96f2952bbdfd 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -418,76 +418,6 @@ u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call) } EXPORT_SYMBOL(rxrpc_kernel_get_epoch); -/** - * rxrpc_kernel_check_call - Check a call's state - * @sock: The socket the call is on - * @call: The call to check - * @_compl: Where to store the completion state - * @_abort_code: Where to store any abort code - * - * Allow a kernel service to query the state of a call and find out the manner - * of its termination if it has completed. Returns -EINPROGRESS if the call is - * still going, 0 if the call finished successfully, -ECONNABORTED if the call - * was aborted and an appropriate error if the call failed in some other way. - */ -int rxrpc_kernel_check_call(struct socket *sock, struct rxrpc_call *call, - enum rxrpc_call_completion *_compl, u32 *_abort_code) -{ - if (call->state != RXRPC_CALL_COMPLETE) - return -EINPROGRESS; - smp_rmb(); - *_compl = call->completion; - *_abort_code = call->abort_code; - return call->error; -} -EXPORT_SYMBOL(rxrpc_kernel_check_call); - -/** - * rxrpc_kernel_retry_call - Allow a kernel service to retry a call - * @sock: The socket the call is on - * @call: The call to retry - * @srx: The address of the peer to contact - * @key: The security context to use (defaults to socket setting) - * - * Allow a kernel service to try resending a client call that failed due to a - * network error to a new address. The Tx queue is maintained intact, thereby - * relieving the need to re-encrypt any request data that has already been - * buffered. - */ -int rxrpc_kernel_retry_call(struct socket *sock, struct rxrpc_call *call, - struct sockaddr_rxrpc *srx, struct key *key) -{ - struct rxrpc_conn_parameters cp; - struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - int ret; - - _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); - - if (!key) - key = rx->key; - if (key && !key->payload.data[0]) - key = NULL; /* a no-security key */ - - memset(&cp, 0, sizeof(cp)); - cp.local = rx->local; - cp.key = key; - cp.security_level = 0; - cp.exclusive = false; - cp.service_id = srx->srx_service; - - mutex_lock(&call->user_mutex); - - ret = rxrpc_prepare_call_for_retry(rx, call); - if (ret == 0) - ret = rxrpc_retry_client_call(rx, call, &cp, srx, GFP_KERNEL); - - mutex_unlock(&call->user_mutex); - rxrpc_put_peer(cp.peer); - _leave(" = %d", ret); - return ret; -} -EXPORT_SYMBOL(rxrpc_kernel_retry_call); - /** * rxrpc_kernel_new_call_notification - Get notifications of new calls * @sock: The socket to intercept received messages on diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index bc628acf4f4f..4b1a534d290a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -476,7 +476,6 @@ enum rxrpc_call_flag { RXRPC_CALL_EXPOSED, /* The call was exposed to the world */ RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ - RXRPC_CALL_TX_LASTQ, /* Last packet has been queued */ RXRPC_CALL_SEND_PING, /* A ping will need to be sent */ RXRPC_CALL_PINGING, /* Ping in process */ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ @@ -517,6 +516,18 @@ enum rxrpc_call_state { NR__RXRPC_CALL_STATES }; +/* + * Call completion condition (state == RXRPC_CALL_COMPLETE). + */ +enum rxrpc_call_completion { + RXRPC_CALL_SUCCEEDED, /* - Normal termination */ + RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ + RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ + RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ + RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ + NR__RXRPC_CALL_COMPLETIONS +}; + /* * Call Tx congestion management modes. */ @@ -761,15 +772,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct sockaddr_rxrpc *, struct rxrpc_call_params *, gfp_t, unsigned int); -int rxrpc_retry_client_call(struct rxrpc_sock *, - struct rxrpc_call *, - struct rxrpc_conn_parameters *, - struct sockaddr_rxrpc *, - gfp_t); void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, struct sk_buff *); void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *); -int rxrpc_prepare_call_for_retry(struct rxrpc_sock *, struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); bool __rxrpc_queue_call(struct rxrpc_call *); bool rxrpc_queue_call(struct rxrpc_call *); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 8f1a8f85b1f9..8aa2937b069f 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -324,48 +324,6 @@ error: return ERR_PTR(ret); } -/* - * Retry a call to a new address. It is expected that the Tx queue of the call - * will contain data previously packaged for an old call. - */ -int rxrpc_retry_client_call(struct rxrpc_sock *rx, - struct rxrpc_call *call, - struct rxrpc_conn_parameters *cp, - struct sockaddr_rxrpc *srx, - gfp_t gfp) -{ - const void *here = __builtin_return_address(0); - int ret; - - /* Set up or get a connection record and set the protocol parameters, - * including channel number and call ID. - */ - ret = rxrpc_connect_call(rx, call, cp, srx, gfp); - if (ret < 0) - goto error; - - trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage), - here, NULL); - - rxrpc_start_call_timer(call); - - _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); - - if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) - rxrpc_queue_call(call); - - _leave(" = 0"); - return 0; - -error: - rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, - RX_CALL_DEAD, ret); - trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage), - here, ERR_PTR(ret)); - _leave(" = %d", ret); - return ret; -} - /* * Set up an incoming call. call->conn points to the connection. * This is called in BH context and isn't allowed to fail. @@ -533,61 +491,6 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) _leave(""); } -/* - * Prepare a kernel service call for retry. - */ -int rxrpc_prepare_call_for_retry(struct rxrpc_sock *rx, struct rxrpc_call *call) -{ - const void *here = __builtin_return_address(0); - int i; - u8 last = 0; - - _enter("{%d,%d}", call->debug_id, atomic_read(&call->usage)); - - trace_rxrpc_call(call, rxrpc_call_release, atomic_read(&call->usage), - here, (const void *)call->flags); - - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); - ASSERTCMP(call->completion, !=, RXRPC_CALL_REMOTELY_ABORTED); - ASSERTCMP(call->completion, !=, RXRPC_CALL_LOCALLY_ABORTED); - ASSERT(list_empty(&call->recvmsg_link)); - - del_timer_sync(&call->timer); - - _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, call->conn); - - if (call->conn) - rxrpc_disconnect_call(call); - - if (rxrpc_is_service_call(call) || - !call->tx_phase || - call->tx_hard_ack != 0 || - call->rx_hard_ack != 0 || - call->rx_top != 0) - return -EINVAL; - - call->state = RXRPC_CALL_UNINITIALISED; - call->completion = RXRPC_CALL_SUCCEEDED; - call->call_id = 0; - call->cid = 0; - call->cong_cwnd = 0; - call->cong_extra = 0; - call->cong_ssthresh = 0; - call->cong_mode = 0; - call->cong_dup_acks = 0; - call->cong_cumul_acks = 0; - call->acks_lowest_nak = 0; - - for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) { - last |= call->rxtx_annotations[i]; - call->rxtx_annotations[i] &= RXRPC_TX_ANNO_LAST; - call->rxtx_annotations[i] |= RXRPC_TX_ANNO_RETRANS; - } - - _leave(" = 0"); - return 0; -} - /* * release all the calls associated with a socket */ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 521189f4b666..b2adfa825363 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -562,10 +562,7 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, clear_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags); write_lock_bh(&call->state_lock); - if (!test_bit(RXRPC_CALL_TX_LASTQ, &call->flags)) - call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; - else - call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; + call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; write_unlock_bh(&call->state_lock); rxrpc_see_call(call); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index be01f9c5d963..46c9312085b1 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -169,10 +169,8 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, ASSERTCMP(seq, ==, call->tx_top + 1); - if (last) { + if (last) annotation |= RXRPC_TX_ANNO_LAST; - set_bit(RXRPC_CALL_TX_LASTQ, &call->flags); - } /* We have to set the timestamp before queueing as the retransmit * algorithm can see the packet as soon as we queue it. @@ -386,6 +384,11 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, call->tx_total_len -= copy; } + /* check for the far side aborting the call or a network error + * occurring */ + if (call->state == RXRPC_CALL_COMPLETE) + goto call_terminated; + /* add the packet to the send queue if it's now full */ if (sp->remain <= 0 || (msg_data_left(msg) == 0 && !more)) { @@ -425,16 +428,6 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, notify_end_tx); skb = NULL; } - - /* Check for the far side aborting the call or a network error - * occurring. If this happens, save any packet that was under - * construction so that in the case of a network error, the - * call can be retried or redirected. - */ - if (call->state == RXRPC_CALL_COMPLETE) { - ret = call->error; - goto out; - } } while (msg_data_left(msg) > 0); success: @@ -444,6 +437,11 @@ out: _leave(" = %d", ret); return ret; +call_terminated: + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); + _leave(" = %d", call->error); + return call->error; + maybe_error: if (copied) goto success; -- cgit v1.2.3 From 9174c3df1cd181c14913138d50ccbe539bb08335 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 10 Jan 2019 20:21:02 +0100 Subject: net/sched: act_tunnel_key: fix memory leak in case of action replace running the following TDC test cases: 7afc - Replace tunnel_key set action with all parameters 364d - Replace tunnel_key set action with all parameters and cookie it's possible to trigger kmemleak warnings like: unreferenced object 0xffff94797127ab40 (size 192): comm "tc", pid 3248, jiffies 4300565293 (age 1006.862s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 c0 93 f9 8a ff ff ff ff ................ 41 84 ee 89 ff ff ff ff 00 00 00 00 00 00 00 00 A............... backtrace: [<000000001e85b61c>] tunnel_key_init+0x31d/0x820 [act_tunnel_key] [<000000007f3f6ee7>] tcf_action_init_1+0x384/0x4c0 [<00000000e89e3ded>] tcf_action_init+0x12b/0x1a0 [<00000000c1c8c0f8>] tcf_action_add+0x73/0x170 [<0000000095a9fc28>] tc_ctl_action+0x122/0x160 [<000000004bebeac5>] rtnetlink_rcv_msg+0x263/0x2d0 [<000000009fd862dd>] netlink_rcv_skb+0x4a/0x110 [<00000000b55199e7>] netlink_unicast+0x1a0/0x250 [<000000004996cd21>] netlink_sendmsg+0x2c1/0x3c0 [<000000004d6a94b4>] sock_sendmsg+0x36/0x40 [<000000005d9f0208>] ___sys_sendmsg+0x280/0x2f0 [<00000000dec19023>] __sys_sendmsg+0x5e/0xa0 [<000000004b82ac81>] do_syscall_64+0x5b/0x180 [<00000000a0f1209a>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [<000000002926b2ab>] 0xffffffffffffffff when the tunnel_key action is replaced, the kernel forgets to release the dst metadata: ensure they are released by tunnel_key_init(), the same way it's done in tunnel_key_release(). Fixes: d0f6dd8a914f4 ("net/sched: Introduce act_tunnel_key") Signed-off-by: Davide Caratti Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_tunnel_key.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index c3b90fadaff6..8b43fe0130f7 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -197,6 +197,15 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_TTL] = { .type = NLA_U8 }, }; +static void tunnel_key_release_params(struct tcf_tunnel_key_params *p) +{ + if (!p) + return; + if (p->tcft_action == TCA_TUNNEL_KEY_ACT_SET) + dst_release(&p->tcft_enc_metadata->dst); + kfree_rcu(p, rcu); +} + static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, @@ -360,8 +369,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, rcu_swap_protected(t->params, params_new, lockdep_is_held(&t->tcf_lock)); spin_unlock_bh(&t->tcf_lock); - if (params_new) - kfree_rcu(params_new, rcu); + tunnel_key_release_params(params_new); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -385,12 +393,7 @@ static void tunnel_key_release(struct tc_action *a) struct tcf_tunnel_key_params *params; params = rcu_dereference_protected(t->params, 1); - if (params) { - if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) - dst_release(¶ms->tcft_enc_metadata->dst); - - kfree_rcu(params, rcu); - } + tunnel_key_release_params(params); } static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, -- cgit v1.2.3 From 13d7f46386e060df31b727c9975e38306fa51e7a Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 10 Jan 2019 14:40:33 -0500 Subject: tcp: allow MSG_ZEROCOPY transmission also in CLOSE_WAIT state TCP transmission with MSG_ZEROCOPY fails if the peer closes its end of the connection and so transitions this socket to CLOSE_WAIT state. Transmission in close wait state is acceptable. Other similar tests in the stack (e.g., in FastOpen) accept both states. Relax this test, too. Link: https://www.mail-archive.com/netdev@vger.kernel.org/msg276886.html Link: https://www.mail-archive.com/netdev@vger.kernel.org/msg227390.html Fixes: f214f915e7db ("tcp: enable MSG_ZEROCOPY") Reported-by: Marek Majkowski Signed-off-by: Willem de Bruijn CC: Yuchung Cheng CC: Neal Cardwell CC: Soheil Hassas Yeganeh CC: Alexey Kodanev Acked-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 27e2f6837062..2079145a3b7c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1186,7 +1186,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) flags = msg->msg_flags; if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) { - if (sk->sk_state != TCP_ESTABLISHED) { + if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { err = -EINVAL; goto out_err; } -- cgit v1.2.3 From 26fc181e6cacacd4837da7ffe0c871134a421600 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 11 Jan 2019 06:27:35 -0800 Subject: fou, fou6: do not assume linear skbs Both gue_err() and gue6_err() incorrectly assume linear skbs. Fix them to use pskb_may_pull(). BUG: KMSAN: uninit-value in gue6_err+0x475/0xc40 net/ipv6/fou6.c:101 CPU: 0 PID: 18083 Comm: syz-executor1 Not tainted 5.0.0-rc1+ #7 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x173/0x1d0 lib/dump_stack.c:113 kmsan_report+0x12e/0x2a0 mm/kmsan/kmsan.c:600 __msan_warning+0x82/0xf0 mm/kmsan/kmsan_instr.c:313 gue6_err+0x475/0xc40 net/ipv6/fou6.c:101 __udp6_lib_err_encap_no_sk net/ipv6/udp.c:434 [inline] __udp6_lib_err_encap net/ipv6/udp.c:491 [inline] __udp6_lib_err+0x18d0/0x2590 net/ipv6/udp.c:522 udplitev6_err+0x118/0x130 net/ipv6/udplite.c:27 icmpv6_notify+0x462/0x9f0 net/ipv6/icmp.c:784 icmpv6_rcv+0x18ac/0x3fa0 net/ipv6/icmp.c:872 ip6_protocol_deliver_rcu+0xb5a/0x23a0 net/ipv6/ip6_input.c:394 ip6_input_finish net/ipv6/ip6_input.c:434 [inline] NF_HOOK include/linux/netfilter.h:289 [inline] ip6_input+0x2b6/0x350 net/ipv6/ip6_input.c:443 dst_input include/net/dst.h:450 [inline] ip6_rcv_finish+0x4e7/0x6d0 net/ipv6/ip6_input.c:76 NF_HOOK include/linux/netfilter.h:289 [inline] ipv6_rcv+0x34b/0x3f0 net/ipv6/ip6_input.c:272 __netif_receive_skb_one_core net/core/dev.c:4973 [inline] __netif_receive_skb net/core/dev.c:5083 [inline] process_backlog+0x756/0x10e0 net/core/dev.c:5923 napi_poll net/core/dev.c:6346 [inline] net_rx_action+0x78b/0x1a60 net/core/dev.c:6412 __do_softirq+0x53f/0x93a kernel/softirq.c:293 do_softirq_own_stack+0x49/0x80 arch/x86/entry/entry_64.S:1039 do_softirq kernel/softirq.c:338 [inline] __local_bh_enable_ip+0x16f/0x1a0 kernel/softirq.c:190 local_bh_enable+0x36/0x40 include/linux/bottom_half.h:32 rcu_read_unlock_bh include/linux/rcupdate.h:696 [inline] ip6_finish_output2+0x1d64/0x25f0 net/ipv6/ip6_output.c:121 ip6_finish_output+0xae4/0xbc0 net/ipv6/ip6_output.c:154 NF_HOOK_COND include/linux/netfilter.h:278 [inline] ip6_output+0x5ca/0x710 net/ipv6/ip6_output.c:171 dst_output include/net/dst.h:444 [inline] ip6_local_out+0x164/0x1d0 net/ipv6/output_core.c:176 ip6_send_skb+0xfa/0x390 net/ipv6/ip6_output.c:1727 udp_v6_send_skb+0x1733/0x1d20 net/ipv6/udp.c:1169 udpv6_sendmsg+0x424e/0x45d0 net/ipv6/udp.c:1466 inet_sendmsg+0x54a/0x720 net/ipv4/af_inet.c:798 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] ___sys_sendmsg+0xdb9/0x11b0 net/socket.c:2116 __sys_sendmmsg+0x580/0xad0 net/socket.c:2211 __do_sys_sendmmsg net/socket.c:2240 [inline] __se_sys_sendmmsg+0xbd/0xe0 net/socket.c:2237 __x64_sys_sendmmsg+0x56/0x70 net/socket.c:2237 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 RIP: 0033:0x457ec9 Code: 6d b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 3b b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f4a5204fc78 EFLAGS: 00000246 ORIG_RAX: 0000000000000133 RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 0000000000457ec9 RDX: 00000000040001ab RSI: 0000000020000240 RDI: 0000000000000003 RBP: 000000000073bf00 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f4a520506d4 R13: 00000000004c4ce5 R14: 00000000004d85d8 R15: 00000000ffffffff Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:205 [inline] kmsan_internal_poison_shadow+0x92/0x150 mm/kmsan/kmsan.c:159 kmsan_kmalloc+0xa6/0x130 mm/kmsan/kmsan_hooks.c:176 kmsan_slab_alloc+0xe/0x10 mm/kmsan/kmsan_hooks.c:185 slab_post_alloc_hook mm/slab.h:446 [inline] slab_alloc_node mm/slub.c:2754 [inline] __kmalloc_node_track_caller+0xe9e/0xff0 mm/slub.c:4377 __kmalloc_reserve net/core/skbuff.c:140 [inline] __alloc_skb+0x309/0xa20 net/core/skbuff.c:208 alloc_skb include/linux/skbuff.h:1012 [inline] alloc_skb_with_frags+0x1c7/0xac0 net/core/skbuff.c:5288 sock_alloc_send_pskb+0xafd/0x10a0 net/core/sock.c:2091 sock_alloc_send_skb+0xca/0xe0 net/core/sock.c:2108 __ip6_append_data+0x42ed/0x5dc0 net/ipv6/ip6_output.c:1443 ip6_append_data+0x3c2/0x650 net/ipv6/ip6_output.c:1619 icmp6_send+0x2f5c/0x3c40 net/ipv6/icmp.c:574 icmpv6_send+0xe5/0x110 net/ipv6/ip6_icmp.c:43 ip6_link_failure+0x5c/0x2c0 net/ipv6/route.c:2231 dst_link_failure include/net/dst.h:427 [inline] vti_xmit net/ipv4/ip_vti.c:229 [inline] vti_tunnel_xmit+0xf3b/0x1ea0 net/ipv4/ip_vti.c:265 __netdev_start_xmit include/linux/netdevice.h:4382 [inline] netdev_start_xmit include/linux/netdevice.h:4391 [inline] xmit_one net/core/dev.c:3278 [inline] dev_hard_start_xmit+0x604/0xc40 net/core/dev.c:3294 __dev_queue_xmit+0x2e48/0x3b80 net/core/dev.c:3864 dev_queue_xmit+0x4b/0x60 net/core/dev.c:3897 neigh_direct_output+0x42/0x50 net/core/neighbour.c:1511 neigh_output include/net/neighbour.h:508 [inline] ip6_finish_output2+0x1d4e/0x25f0 net/ipv6/ip6_output.c:120 ip6_finish_output+0xae4/0xbc0 net/ipv6/ip6_output.c:154 NF_HOOK_COND include/linux/netfilter.h:278 [inline] ip6_output+0x5ca/0x710 net/ipv6/ip6_output.c:171 dst_output include/net/dst.h:444 [inline] ip6_local_out+0x164/0x1d0 net/ipv6/output_core.c:176 ip6_send_skb+0xfa/0x390 net/ipv6/ip6_output.c:1727 udp_v6_send_skb+0x1733/0x1d20 net/ipv6/udp.c:1169 udpv6_sendmsg+0x424e/0x45d0 net/ipv6/udp.c:1466 inet_sendmsg+0x54a/0x720 net/ipv4/af_inet.c:798 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] ___sys_sendmsg+0xdb9/0x11b0 net/socket.c:2116 __sys_sendmmsg+0x580/0xad0 net/socket.c:2211 __do_sys_sendmmsg net/socket.c:2240 [inline] __se_sys_sendmmsg+0xbd/0xe0 net/socket.c:2237 __x64_sys_sendmmsg+0x56/0x70 net/socket.c:2237 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 Fixes: b8a51b38e4d4 ("fou, fou6: ICMP error handlers for FoU and GUE") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Stefano Brivio Cc: Sabrina Dubroca Signed-off-by: David S. Miller --- net/ipv4/fou.c | 9 +++++++-- net/ipv6/fou6.c | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 632863541082..437070d1ffb1 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -1020,10 +1020,11 @@ static int gue_err(struct sk_buff *skb, u32 info) { int transport_offset = skb_transport_offset(skb); struct guehdr *guehdr; - size_t optlen; + size_t len, optlen; int ret; - if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr)) + len = sizeof(struct udphdr) + sizeof(struct guehdr); + if (!pskb_may_pull(skb, len)) return -EINVAL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; @@ -1058,6 +1059,10 @@ static int gue_err(struct sk_buff *skb, u32 info) optlen = guehdr->hlen << 2; + if (!pskb_may_pull(skb, len + optlen)) + return -EINVAL; + + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; if (validate_gue_flags(guehdr, optlen)) return -EINVAL; diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c index 7da7bf3b7fe3..b858bd5280bf 100644 --- a/net/ipv6/fou6.c +++ b/net/ipv6/fou6.c @@ -90,10 +90,11 @@ static int gue6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { int transport_offset = skb_transport_offset(skb); struct guehdr *guehdr; - size_t optlen; + size_t len, optlen; int ret; - if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr)) + len = sizeof(struct udphdr) + sizeof(struct guehdr); + if (!pskb_may_pull(skb, len)) return -EINVAL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; @@ -128,6 +129,10 @@ static int gue6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, optlen = guehdr->hlen << 2; + if (!pskb_may_pull(skb, len + optlen)) + return -EINVAL; + + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; if (validate_gue_flags(guehdr, optlen)) return -EINVAL; -- cgit v1.2.3 From e2612cd496e7b465711d219ea6118893d7253f52 Mon Sep 17 00:00:00 2001 From: Benedict Wong Date: Mon, 14 Jan 2019 11:24:38 -0800 Subject: xfrm: Make set-mark default behavior backward compatible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes 9b42c1f179a6, which changed the default route lookup behavior for tunnel mode SAs in the outbound direction to use the skb mark, whereas previously mark=0 was used if the output mark was unspecified. In mark-based routing schemes such as Android’s, this change in default behavior causes routing loops or lookup failures. This patch restores the default behavior of using a 0 mark while still incorporating the skb mark if the SET_MARK (and SET_MARK_MASK) is specified. Tested with additions to Android's kernel unit test suite: https://android-review.googlesource.com/c/kernel/tests/+/860150 Fixes: 9b42c1f179a6 ("xfrm: Extend the output_mark to support input direction and masking") Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 8cfd75b62396..ba0a4048c846 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2600,7 +2600,10 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst_copy_metrics(dst1, dst); if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { - __u32 mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]); + __u32 mark = 0; + + if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m) + mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]); family = xfrm[i]->props.family; dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif, -- cgit v1.2.3 From cd0c4e70fc0ccfa705cdf55efb27519ce9337a26 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 11 Jan 2019 18:55:42 -0800 Subject: net_sched: refetch skb protocol for each filter Martin reported a set of filters don't work after changing from reclassify to continue. Looking into the code, it looks like skb protocol is not always fetched for each iteration of the filters. But, as demonstrated by Martin, TC actions could modify skb->protocol, for example act_vlan, this means we have to refetch skb protocol in each iteration, rather than using the one we fetch in the beginning of the loop. This bug is _not_ introduced by commit 3b3ae880266d ("net: sched: consolidate tc_classify{,_compat}"), technically, if act_vlan is the only action that modifies skb protocol, then it is commit c7e2b9689ef8 ("sched: introduce vlan action") which introduced this bug. Reported-by: Martin Olsson Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/cls_api.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8ce2a0507970..e2b5cb2eb34e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1277,7 +1277,6 @@ EXPORT_SYMBOL(tcf_block_cb_unregister); int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { - __be16 protocol = tc_skb_protocol(skb); #ifdef CONFIG_NET_CLS_ACT const int max_reclassify_loop = 4; const struct tcf_proto *orig_tp = tp; @@ -1287,6 +1286,7 @@ int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, reclassify: #endif for (; tp; tp = rcu_dereference_bh(tp->next)) { + __be16 protocol = tc_skb_protocol(skb); int err; if (tp->protocol != protocol && @@ -1319,7 +1319,6 @@ reset: } tp = first_tp; - protocol = tc_skb_protocol(skb); goto reclassify; #endif } -- cgit v1.2.3 From 04a4af334b971814eedf4e4a413343ad3287d9a9 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Mon, 14 Jan 2019 09:16:56 +0000 Subject: openvswitch: Avoid OOB read when parsing flow nlattrs For nested and variable attributes, the expected length of an attribute is not known and marked by a negative number. This results in an OOB read when the expected length is later used to check if the attribute is all zeros. Fix this by using the actual length of the attribute rather than the expected length. Signed-off-by: Ross Lagerwall Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 435a4bdf8f89..691da853bef5 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -500,7 +500,7 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, return -EINVAL; } - if (!nz || !is_all_zero(nla_data(nla), expected_len)) { + if (!nz || !is_all_zero(nla_data(nla), nla_len(nla))) { attrs |= 1 << type; a[type] = nla; } -- cgit v1.2.3 From 20704bd1633dd5afb29a321d3a615c9c8e9c9d05 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 14 Jan 2019 18:10:06 +0800 Subject: erspan: build the header with the right proto according to erspan_ver As said in draft-foschiano-erspan-03#section4: Different frame variants known as "ERSPAN Types" can be distinguished based on the GRE "Protocol Type" field value: Type I and II's value is 0x88BE while Type III's is 0x22EB [ETYPES]. So set it properly in erspan_xmit() according to erspan_ver. While at it, also remove the unused parameter 'proto' in erspan_fb_xmit(). Fixes: 94d7d8f29287 ("ip6_gre: add erspan v2 support") Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 23 ++++++++++++++--------- net/ipv6/ip6_gre.c | 6 ++++-- 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d1d09f3e5f9e..b1a74d80d868 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -569,8 +569,7 @@ err_free_skb: dev->stats.tx_dropped++; } -static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, - __be16 proto) +static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_info *tun_info; @@ -578,10 +577,10 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, struct erspan_metadata *md; struct rtable *rt = NULL; bool truncate = false; + __be16 df, proto; struct flowi4 fl; int tunnel_hlen; int version; - __be16 df; int nhoff; int thoff; @@ -626,18 +625,20 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, if (version == 1) { erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)), ntohl(md->u.index), truncate, true); + proto = htons(ETH_P_ERSPAN); } else if (version == 2) { erspan_build_header_v2(skb, ntohl(tunnel_id_to_key32(key->tun_id)), md->u.md2.dir, get_hwid(&md->u.md2), truncate, true); + proto = htons(ETH_P_ERSPAN2); } else { goto err_free_rt; } gre_build_header(skb, 8, TUNNEL_SEQ, - htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++)); + proto, 0, htonl(tunnel->o_seqno++)); df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; @@ -721,12 +722,13 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, { struct ip_tunnel *tunnel = netdev_priv(dev); bool truncate = false; + __be16 proto; if (!pskb_inet_may_pull(skb)) goto free_skb; if (tunnel->collect_md) { - erspan_fb_xmit(skb, dev, skb->protocol); + erspan_fb_xmit(skb, dev); return NETDEV_TX_OK; } @@ -742,19 +744,22 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, } /* Push ERSPAN header */ - if (tunnel->erspan_ver == 1) + if (tunnel->erspan_ver == 1) { erspan_build_header(skb, ntohl(tunnel->parms.o_key), tunnel->index, truncate, true); - else if (tunnel->erspan_ver == 2) + proto = htons(ETH_P_ERSPAN); + } else if (tunnel->erspan_ver == 2) { erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key), tunnel->dir, tunnel->hwid, truncate, true); - else + proto = htons(ETH_P_ERSPAN2); + } else { goto free_skb; + } tunnel->parms.o_flags &= ~TUNNEL_KEY; - __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN)); + __gre_xmit(skb, dev, &tunnel->parms.iph, proto); return NETDEV_TX_OK; free_skb: diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 026f08735549..b1be67ca6768 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -922,6 +922,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, __u8 dsfield = false; struct flowi6 fl6; int err = -EINVAL; + __be16 proto; __u32 mtu; int nhoff; int thoff; @@ -1035,8 +1036,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, } /* Push GRE header. */ - gre_build_header(skb, 8, TUNNEL_SEQ, - htons(ETH_P_ERSPAN), 0, htonl(t->o_seqno++)); + proto = (t->parms.erspan_ver == 1) ? htons(ETH_P_ERSPAN) + : htons(ETH_P_ERSPAN2); + gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(t->o_seqno++)); /* TooBig packet may have updated dst->dev's mtu */ if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) -- cgit v1.2.3 From 400b8b9a2a17918f8ce00786f596f530e7f30d50 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 14 Jan 2019 18:34:02 +0800 Subject: sctp: allocate sctp_sockaddr_entry with kzalloc The similar issue as fixed in Commit 4a2eb0c37b47 ("sctp: initialize sin6_flowinfo for ipv6 addrs in sctp_inet6addr_event") also exists in sctp_inetaddr_event, as Alexander noticed. To fix it, allocate sctp_sockaddr_entry with kzalloc for both sctp ipv4 and ipv6 addresses, as does in sctp_v4/6_copy_addrlist(). Reported-by: Alexander Potapenko Signed-off-by: Xin Long Reported-by: syzbot+ae0c70c0c2d40c51bb92@syzkaller.appspotmail.com Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 5 +---- net/sctp/protocol.c | 4 +--- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index b9ed271b7ef7..ed8e006dae85 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -97,11 +97,9 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev, switch (ev) { case NETDEV_UP: - addr = kmalloc(sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC); + addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v6.sin6_family = AF_INET6; - addr->a.v6.sin6_port = 0; - addr->a.v6.sin6_flowinfo = 0; addr->a.v6.sin6_addr = ifa->addr; addr->a.v6.sin6_scope_id = ifa->idev->dev->ifindex; addr->valid = 1; @@ -434,7 +432,6 @@ static void sctp_v6_copy_addrlist(struct list_head *addrlist, addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v6.sin6_family = AF_INET6; - addr->a.v6.sin6_port = 0; addr->a.v6.sin6_addr = ifp->addr; addr->a.v6.sin6_scope_id = dev->ifindex; addr->valid = 1; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index d5878ae55840..4e0eeb113ef5 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -101,7 +101,6 @@ static void sctp_v4_copy_addrlist(struct list_head *addrlist, addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v4.sin_family = AF_INET; - addr->a.v4.sin_port = 0; addr->a.v4.sin_addr.s_addr = ifa->ifa_local; addr->valid = 1; INIT_LIST_HEAD(&addr->list); @@ -776,10 +775,9 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev, switch (ev) { case NETDEV_UP: - addr = kmalloc(sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC); + addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v4.sin_family = AF_INET; - addr->a.v4.sin_port = 0; addr->a.v4.sin_addr.s_addr = ifa->ifa_local; addr->valid = 1; spin_lock_bh(&net->sctp.local_addr_lock); -- cgit v1.2.3 From 01b833ab44c9e484060aad72267fc7e71beb559b Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 14 Jan 2019 13:38:43 +0300 Subject: net/core/neighbour: fix kmemleak minimal reference count for hash tables This should be 1 for normal allocations, 0 disables leak reporting. Signed-off-by: Konstantin Khlebnikov Reported-by: Cong Wang Fixes: 85704cb8dcfd ("net/core/neighbour: tell kmemleak about hash tables") Signed-off-by: David S. Miller --- net/core/neighbour.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3e27a779f288..96fdc9134726 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -450,7 +450,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) buckets = (struct neighbour __rcu **) __get_free_pages(GFP_ATOMIC | __GFP_ZERO, get_order(size)); - kmemleak_alloc(buckets, size, 0, GFP_ATOMIC); + kmemleak_alloc(buckets, size, 1, GFP_ATOMIC); } if (!buckets) { kfree(ret); -- cgit v1.2.3 From a5a82d841186d13c4a6d500dfcf7d02b4195e3ff Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Jan 2019 10:52:45 -0800 Subject: ipv6: route: place a warning with duplicated string with correct extack "IPv6: " prefix is already added by pr_fmt, no need to include it again in the pr_warn() format. The message predates extack support, we can replace the whole thing with an extack message. Suggested-by: David Ahern Signed-off-by: Jakub Kicinski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 40b225f87d5e..964491cf3672 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4251,17 +4251,6 @@ struct rt6_nh { struct list_head next; }; -static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) -{ - struct rt6_nh *nh; - - list_for_each_entry(nh, rt6_nh_list, next) { - pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", - &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, - nh->r_cfg.fc_ifindex); - } -} - static int ip6_route_info_append(struct net *net, struct list_head *rt6_nh_list, struct fib6_info *rt, @@ -4407,7 +4396,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nh->fib6_info = NULL; if (err) { if (replace && nhn) - ip6_print_replace_route_err(&rt6_nh_list); + NL_SET_ERR_MSG_MOD(extack, + "multipath route replace failed (check consistency of installed routes)"); err_nh = nh; goto add_errout; } -- cgit v1.2.3 From 1a9352687c19e4937d861ff2c5c6fc45c0a08aff Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 16 Jan 2019 01:35:22 +0900 Subject: net: bpfilter: change section name of bpfilter UMH blob. The section of bpfilter UMH blob is the ".bpfilter_umh". but this is not an explicit section. so linking warning occurred at compile time for the powerpc. So, this patch makes use of the ".rodata" instead of the ".bpfilter_umh". Config condition: CONFIG_BPFILTER=y CONFIG_BPFILTER_UMH=y Result: ld: warning: orphan section `.bpfilter_umh' from `net/bpfilter/bpfilter_umh_blob.o' being placed in section `.bpfilter_umh' Fixes: 61fbf5933d42 ("net: bpfilter: restart bpfilter_umh when error occurred") Reported-by: Stephen Rothwell Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/bpfilter/bpfilter_umh_blob.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S index 7f1c521dcc2f..9ea6100dca87 100644 --- a/net/bpfilter/bpfilter_umh_blob.S +++ b/net/bpfilter/bpfilter_umh_blob.S @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ - .section .bpfilter_umh, "a" + .section .rodata, "a" .global bpfilter_umh_start bpfilter_umh_start: .incbin "net/bpfilter/bpfilter_umh" -- cgit v1.2.3 From 0f149c9fec3cd720628ecde83bfc6f64c1e7dcb6 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 15 Jan 2019 11:40:02 -0500 Subject: udp: with udp_segment release on error path Failure __ip_append_data triggers udp_flush_pending_frames, but these tests happen later. The skb must be freed directly. Fixes: bec1f6f697362 ("udp: generate gso with UDP_SEGMENT") Reported-by: Eric Dumazet Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 16 ++++++++++++---- net/ipv6/udp.c | 16 ++++++++++++---- 2 files changed, 24 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3fb0ed5e4789..3d2a81bdc2ab 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -847,15 +847,23 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); - if (hlen + cork->gso_size > cork->fragsize) + if (hlen + cork->gso_size > cork->fragsize) { + kfree_skb(skb); return -EINVAL; - if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) + } + if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) { + kfree_skb(skb); return -EINVAL; - if (sk->sk_no_check_tx) + } + if (sk->sk_no_check_tx) { + kfree_skb(skb); return -EINVAL; + } if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite || - dst_xfrm(skb_dst(skb))) + dst_xfrm(skb_dst(skb))) { + kfree_skb(skb); return -EIO; + } skb_shinfo(skb)->gso_size = cork->gso_size; skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7c3505006f8e..e1f2b9660666 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1132,15 +1132,23 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); - if (hlen + cork->gso_size > cork->fragsize) + if (hlen + cork->gso_size > cork->fragsize) { + kfree_skb(skb); return -EINVAL; - if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) + } + if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) { + kfree_skb(skb); return -EINVAL; - if (udp_sk(sk)->no_check6_tx) + } + if (udp_sk(sk)->no_check6_tx) { + kfree_skb(skb); return -EINVAL; + } if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite || - dst_xfrm(skb_dst(skb))) + dst_xfrm(skb_dst(skb))) { + kfree_skb(skb); return -EIO; + } skb_shinfo(skb)->gso_size = cork->gso_size; skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; -- cgit v1.2.3 From c61c27687a5abce11431e6de1adb6e36099b9859 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Wed, 16 Jan 2019 20:35:41 +0100 Subject: bpf: Correctly annotate implicit fall through in bpf_base_func_proto MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a plan to build the kernel with -Wimplicit-fallthrough and this place in the code produced a warnings (W=1). To preserve as much of the existing comment only change a ‘:’ into a ‘,’. This is enough change, to match the regular expression expected by GCC. This commit removes the following warning: net/core/filter.c:5310:6: warning: this statement may fall through [-Wimplicit-fallthrough=] Signed-off-by: Mathieu Malaterre Signed-off-by: Daniel Borkmann --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 2b3b436ef545..d9076e609fca 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5309,7 +5309,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); - /* else: fall through */ + /* else, fall through */ default: return NULL; } -- cgit v1.2.3 From 2cddd20147826aef283115abb00012d4dafe3cdb Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Wed, 16 Jan 2019 16:53:52 +0100 Subject: net/sched: cls_flower: allocate mask dynamically in fl_change() Recent changes (especially 05cd271fd61a ("cls_flower: Support multiple masks per priority")) in the fl_flow_mask structure grow it and its current size e.g. on x86_64 with defconfig is 760 bytes and more than 1024 bytes with some debug options enabled. Prior the mentioned commit its size was 176 bytes (using defconfig on x86_64). With regard to this fact it's reasonable to allocate this structure dynamically in fl_change() to reduce its stack size. v2: - use kzalloc() instead of kcalloc() Fixes: 05cd271fd61a ("cls_flower: Support multiple masks per priority") Cc: Jiri Pirko Cc: Paul Blakey Acked-by: Jiri Pirko Signed-off-by: Ivan Vecera Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index dad04e710493..f6aa57fbbbaf 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1290,17 +1290,23 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *fold = *arg; struct cls_fl_filter *fnew; + struct fl_flow_mask *mask; struct nlattr **tb; - struct fl_flow_mask mask = {}; int err; if (!tca[TCA_OPTIONS]) return -EINVAL; - tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL); - if (!tb) + mask = kzalloc(sizeof(struct fl_flow_mask), GFP_KERNEL); + if (!mask) return -ENOBUFS; + tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL); + if (!tb) { + err = -ENOBUFS; + goto errout_mask_alloc; + } + err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy, NULL); if (err < 0) @@ -1343,12 +1349,12 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } } - err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr, + err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE], ovr, tp->chain->tmplt_priv, extack); if (err) goto errout_idr; - err = fl_check_assign_mask(head, fnew, fold, &mask); + err = fl_check_assign_mask(head, fnew, fold, mask); if (err) goto errout_idr; @@ -1392,6 +1398,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } kfree(tb); + kfree(mask); return 0; errout_mask: @@ -1405,6 +1412,8 @@ errout: kfree(fnew); errout_tb: kfree(tb); +errout_mask_alloc: + kfree(mask); return err; } -- cgit v1.2.3 From 8f6b5392856a4b74224e257f3e0874a163b04603 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Wed, 16 Jan 2019 19:17:44 +0300 Subject: udp: add missing rehash callback to udplite After commit 4cdeeee9252a ("net: udp: prefer listeners bound to an address"), UDP-Lite only works when specifying a local address for the sockets. This is related to the problem addressed in the commit 719f835853a9 ("udp: add rehash on connect()"). Moreover, __udp4_lib_lookup() now looks for a socket immediately in the secondary hash table. The issue was found with LTP/network tests (UDP-Lite test-cases). Fixes: 4cdeeee9252a ("net: udp: prefer listeners bound to an address") Signed-off-by: Alexey Kodanev Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 +- net/ipv4/udp_impl.h | 1 + net/ipv4/udplite.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3d2a81bdc2ab..5c3cd5d84a6f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1926,7 +1926,7 @@ void udp_lib_rehash(struct sock *sk, u16 newhash) } EXPORT_SYMBOL(udp_lib_rehash); -static void udp_v4_rehash(struct sock *sk) +void udp_v4_rehash(struct sock *sk) { u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 322672655419..6b2fa77eeb1c 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -10,6 +10,7 @@ int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int); int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); int udp_v4_get_port(struct sock *sk, unsigned short snum); +void udp_v4_rehash(struct sock *sk); int udp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 39c7f17d916f..3c94b8f0ff27 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -53,6 +53,7 @@ struct proto udplite_prot = { .sendpage = udp_sendpage, .hash = udp_lib_hash, .unhash = udp_lib_unhash, + .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, -- cgit v1.2.3 From f7c46156f4a9d6ba5c6bcc5c48945e87b0f08c65 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Wed, 16 Jan 2019 19:17:45 +0300 Subject: udp6: add missing rehash callback to udplite After commit 23b0269e58ae ("net: udp6: prefer listeners bound to an address"), UDP-Lite only works when specifying a local address for the sockets. This is related to the problem addressed in the commit 719f835853a9 ("udp: add rehash on connect()"). Moreover, __udp6_lib_lookup() now looks for a socket immediately in the secondary hash table. And this issue was found with LTP/network tests as well. Fixes: 23b0269e58ae ("net: udp6: prefer listeners bound to an address") Signed-off-by: Alexey Kodanev Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/udp.c | 2 +- net/ipv6/udp_impl.h | 1 + net/ipv6/udplite.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e1f2b9660666..2596ffdeebea 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -102,7 +102,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum) return udp_lib_get_port(sk, snum, hash2_nulladdr); } -static void udp_v6_rehash(struct sock *sk) +void udp_v6_rehash(struct sock *sk) { u16 new_hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 5730e6503cb4..20e324b6f358 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -13,6 +13,7 @@ int __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int, __be32, struct udp_table *); int udp_v6_get_port(struct sock *sk, unsigned short snum); +void udp_v6_rehash(struct sock *sk); int udpv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index a125aebc29e5..f35907836444 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -49,6 +49,7 @@ struct proto udplitev6_prot = { .recvmsg = udpv6_recvmsg, .hash = udp_lib_hash, .unhash = udp_lib_unhash, + .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, -- cgit v1.2.3 From f4924f24da8c7ef64195096817f3cde324091d97 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 16 Jan 2019 08:47:54 -0800 Subject: bpf: bpf_setsockopt: reset sock dst on SO_MARK changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In sock_setsockopt() (net/core/sock.h), when SO_MARK option is used to change sk_mark, sk_dst_reset(sk) is called. The same should be done in bpf_setsockopt(). Fixes: 8c4b4c7e9ff0 ("bpf: Add setsockopt helper function to bpf") Reported-by: Maciej Å»enczykowski Signed-off-by: Peter Oskolkov Acked-by: Martin KaFai Lau Reviewed-by: Maciej Å»enczykowski Signed-off-by: Daniel Borkmann --- net/core/filter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index d9076e609fca..d9ea51b47f35 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4132,7 +4132,10 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_rcvlowat = val ? : 1; break; case SO_MARK: - sk->sk_mark = val; + if (sk->sk_mark != val) { + sk->sk_mark = val; + sk_dst_reset(sk); + } break; default: ret = -EINVAL; -- cgit v1.2.3 From e224c390a6259c529f7b2a6bd215a087b3344f5c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 17 Jan 2019 08:51:01 -0800 Subject: bpf: fix SO_MAX_PACING_RATE to support TCP internal pacing If sch_fq packet scheduler is not used, TCP can fallback to internal pacing, but this requires sk_pacing_status to be properly set. Fixes: 8c4b4c7e9ff0 ("bpf: Add setsockopt helper function to bpf") Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Cc: Lawrence Brakmo Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- net/core/filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index d9ea51b47f35..dab10d21cae8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4119,6 +4119,10 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); break; case SO_MAX_PACING_RATE: /* 32bit version */ + if (val != ~0U) + cmpxchg(&sk->sk_pacing_status, + SK_PACING_NONE, + SK_PACING_NEEDED); sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val; sk->sk_pacing_rate = min(sk->sk_pacing_rate, sk->sk_max_pacing_rate); -- cgit v1.2.3 From 88a8121dc1d3d0dbddd411b79ed236b6b6ea415c Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 17 Jan 2019 11:27:22 +0100 Subject: af_packet: fix raw sockets over 6in4 tunnel Since commit cb9f1b783850, scapy (which uses an AF_PACKET socket in SOCK_RAW mode) is unable to send a basic icmp packet over a sit tunnel: Here is a example of the setup: $ ip link set ntfp2 up $ ip addr add 10.125.0.1/24 dev ntfp2 $ ip tunnel add tun1 mode sit ttl 64 local 10.125.0.1 remote 10.125.0.2 dev ntfp2 $ ip addr add fd00:cafe:cafe::1/128 dev tun1 $ ip link set dev tun1 up $ ip route add fd00:200::/64 dev tun1 $ scapy >>> p = [] >>> p += IPv6(src='fd00:100::1', dst='fd00:200::1')/ICMPv6EchoRequest() >>> send(p, count=1, inter=0.1) >>> quit() $ ip -s link ls dev tun1 | grep -A1 "TX.*errors" TX: bytes packets errors dropped carrier collsns 0 0 1 0 0 0 The problem is that the network offset is set to the hard_header_len of the output device (tun1, ie 14 + 20) and in our case, because the packet is small (48 bytes) the pskb_inet_may_pull() fails (it tries to pull 40 bytes (ipv6 header) starting from the network offset). This problem is more generally related to device with variable hard header length. To avoid a too intrusive patch in the current release, a (ugly) workaround is proposed in this patch. It has to be cleaned up in net-next. Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=993675a3100b1 Link: http://patchwork.ozlabs.org/patch/1024489/ Fixes: cb9f1b783850 ("ip: validate header length on virtual device xmit") CC: Willem de Bruijn CC: Maxim Mikityanskiy Signed-off-by: Nicolas Dichtel Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d0945253f43b..3b1a78906bc0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2887,7 +2887,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_free; } else if (reserve) { skb_reserve(skb, -reserve); - if (len < reserve) + if (len < reserve + sizeof(struct ipv6hdr) && + dev->min_header_len != dev->hard_header_len) skb_reset_network_header(skb); } -- cgit v1.2.3 From 12c44aba6618b7f6c437076e5722237190f6cd5f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 Jan 2019 14:28:48 +0100 Subject: netfilter: nft_compat: use refcnt_t type for nft_xt reference count Using standard integer type was fine while all operations on it were guarded by the nftnl subsys mutex. This isn't true anymore: 1. transactions are guarded only by a pernet mutex, so concurrent rule manipulation in different netns is racy 2. the ->destroy hook runs from a work queue after the transaction mutex has been released already. cpu0 cpu1 (net 1) cpu2 (net 2) kworker nft_compat->destroy nft_compat->init nft_compat->init if (--nft_xt->ref == 0) nft_xt->ref++ nft_xt->ref++ Switch to refcount_t. Doing this however only fixes a minor aspect, nft_compat also performs linked-list operations in an unsafe way. This is addressed in the next two patches. Fixes: f102d66b335a ("netfilter: nf_tables: use dedicated mutex to guard transactions") Fixes: 0935d5588400 ("netfilter: nf_tables: asynchronous release") Reported-by: Taehee Yoo Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 7334e0b80a5e..acc85acad31b 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -26,7 +26,7 @@ struct nft_xt { struct list_head head; struct nft_expr_ops ops; - unsigned int refcnt; + refcount_t refcnt; /* Unlike other expressions, ops doesn't have static storage duration. * nft core assumes they do. We use kfree_rcu so that nft core can @@ -45,7 +45,7 @@ struct nft_xt_match_priv { static bool nft_xt_put(struct nft_xt *xt) { - if (--xt->refcnt == 0) { + if (refcount_dec_and_test(&xt->refcnt)) { list_del(&xt->head); kfree_rcu(xt, rcu_head); return true; @@ -273,7 +273,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; nft_xt = container_of(expr->ops, struct nft_xt, ops); - nft_xt->refcnt++; + refcount_inc(&nft_xt->refcnt); return 0; } @@ -486,7 +486,7 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return ret; nft_xt = container_of(expr->ops, struct nft_xt, ops); - nft_xt->refcnt++; + refcount_inc(&nft_xt->refcnt); return 0; } @@ -789,7 +789,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, goto err; } - nft_match->refcnt = 0; + refcount_set(&nft_match->refcnt, 0); nft_match->ops.type = &nft_match_type; nft_match->ops.eval = nft_match_eval; nft_match->ops.init = nft_match_init; @@ -893,7 +893,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, goto err; } - nft_target->refcnt = 0; + refcount_set(&nft_target->refcnt, 0); nft_target->ops.type = &nft_target_type; nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize)); nft_target->ops.init = nft_target_init; @@ -964,7 +964,7 @@ static void __exit nft_compat_module_exit(void) list_for_each_entry_safe(xt, next, &nft_target_list, head) { struct xt_target *target = xt->ops.data; - if (WARN_ON_ONCE(xt->refcnt)) + if (WARN_ON_ONCE(refcount_read(&xt->refcnt))) continue; module_put(target->me); kfree(xt); @@ -973,7 +973,7 @@ static void __exit nft_compat_module_exit(void) list_for_each_entry_safe(xt, next, &nft_match_list, head) { struct xt_match *match = xt->ops.data; - if (WARN_ON_ONCE(xt->refcnt)) + if (WARN_ON_ONCE(refcount_read(&xt->refcnt))) continue; module_put(match->me); kfree(xt); -- cgit v1.2.3 From cf52572ebbd7189a1966c2b5fc34b97078cd1dce Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 Jan 2019 14:28:49 +0100 Subject: netfilter: nft_compat: make lists per netns There are two problems with nft_compat since the netlink config plane uses a per-netns mutex: 1. Concurrent add/del accesses to the same list 2. accesses to a list element after it has been free'd already. This patch fixes the first problem. Freeing occurs from a work queue, after transaction mutexes have been released, i.e., it still possible for a new transaction (even from same net ns) to find the to-be-deleted expression in the list. The ->destroy functions are not allowed to have any such side effects, i.e. the list_del() in the destroy function is not allowed. This part of the problem is solved in the next patch. I tried to make this work by serializing list access via mutex and by moving list_del() to a deactivate callback, but Taehee spotted following race on this approach: NET #0 NET #1 >select_ops() ->init() ->select_ops() ->deactivate() ->destroy() nft_xt_put() kfree_rcu(xt, rcu_head); ->init() <-- use-after-free occurred. Unfortunately, we can't increment reference count in select_ops(), because we can't undo the refcount increase in case a different expression fails in the same batch. (The destroy hook will only be called in case the expression was initialized successfully). Fixes: f102d66b335a ("netfilter: nf_tables: use dedicated mutex to guard transactions") Reported-by: Taehee Yoo Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 129 +++++++++++++++++++++++++++++++-------------- 1 file changed, 89 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index acc85acad31b..abed3490a8f8 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -22,6 +22,7 @@ #include #include #include +#include struct nft_xt { struct list_head head; @@ -43,6 +44,20 @@ struct nft_xt_match_priv { void *info; }; +struct nft_compat_net { + struct list_head nft_target_list; + struct list_head nft_match_list; +}; + +static unsigned int nft_compat_net_id __read_mostly; +static struct nft_expr_type nft_match_type; +static struct nft_expr_type nft_target_type; + +static struct nft_compat_net *nft_compat_pernet(struct net *net) +{ + return net_generic(net, nft_compat_net_id); +} + static bool nft_xt_put(struct nft_xt *xt) { if (refcount_dec_and_test(&xt->refcnt)) { @@ -734,10 +749,6 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = { .cb = nfnl_nft_compat_cb, }; -static LIST_HEAD(nft_match_list); - -static struct nft_expr_type nft_match_type; - static bool nft_match_cmp(const struct xt_match *match, const char *name, u32 rev, u32 family) { @@ -749,6 +760,7 @@ static const struct nft_expr_ops * nft_match_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { + struct nft_compat_net *cn; struct nft_xt *nft_match; struct xt_match *match; unsigned int matchsize; @@ -765,8 +777,10 @@ nft_match_select_ops(const struct nft_ctx *ctx, rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV])); family = ctx->family; + cn = nft_compat_pernet(ctx->net); + /* Re-use the existing match if it's already loaded. */ - list_for_each_entry(nft_match, &nft_match_list, head) { + list_for_each_entry(nft_match, &cn->nft_match_list, head) { struct xt_match *match = nft_match->ops.data; if (nft_match_cmp(match, mt_name, rev, family)) @@ -810,7 +824,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, nft_match->ops.size = matchsize; - list_add(&nft_match->head, &nft_match_list); + list_add(&nft_match->head, &cn->nft_match_list); return &nft_match->ops; err: @@ -826,10 +840,6 @@ static struct nft_expr_type nft_match_type __read_mostly = { .owner = THIS_MODULE, }; -static LIST_HEAD(nft_target_list); - -static struct nft_expr_type nft_target_type; - static bool nft_target_cmp(const struct xt_target *tg, const char *name, u32 rev, u32 family) { @@ -841,6 +851,7 @@ static const struct nft_expr_ops * nft_target_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { + struct nft_compat_net *cn; struct nft_xt *nft_target; struct xt_target *target; char *tg_name; @@ -861,8 +872,9 @@ nft_target_select_ops(const struct nft_ctx *ctx, strcmp(tg_name, "standard") == 0) return ERR_PTR(-EINVAL); + cn = nft_compat_pernet(ctx->net); /* Re-use the existing target if it's already loaded. */ - list_for_each_entry(nft_target, &nft_target_list, head) { + list_for_each_entry(nft_target, &cn->nft_target_list, head) { struct xt_target *target = nft_target->ops.data; if (!target->target) @@ -907,7 +919,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, else nft_target->ops.eval = nft_target_eval_xt; - list_add(&nft_target->head, &nft_target_list); + list_add(&nft_target->head, &cn->nft_target_list); return &nft_target->ops; err: @@ -923,13 +935,74 @@ static struct nft_expr_type nft_target_type __read_mostly = { .owner = THIS_MODULE, }; +static int __net_init nft_compat_init_net(struct net *net) +{ + struct nft_compat_net *cn = nft_compat_pernet(net); + + INIT_LIST_HEAD(&cn->nft_target_list); + INIT_LIST_HEAD(&cn->nft_match_list); + + return 0; +} + +static void __net_exit nft_compat_exit_net(struct net *net) +{ + struct nft_compat_net *cn = nft_compat_pernet(net); + struct nft_xt *xt, *next; + + if (list_empty(&cn->nft_match_list) && + list_empty(&cn->nft_target_list)) + return; + + /* If there was an error that caused nft_xt expr to not be initialized + * fully and noone else requested the same expression later, the lists + * contain 0-refcount entries that still hold module reference. + * + * Clean them here. + */ + mutex_lock(&net->nft.commit_mutex); + list_for_each_entry_safe(xt, next, &cn->nft_target_list, head) { + struct xt_target *target = xt->ops.data; + + list_del_init(&xt->head); + + if (refcount_read(&xt->refcnt)) + continue; + module_put(target->me); + kfree(xt); + } + + list_for_each_entry_safe(xt, next, &cn->nft_match_list, head) { + struct xt_match *match = xt->ops.data; + + list_del_init(&xt->head); + + if (refcount_read(&xt->refcnt)) + continue; + module_put(match->me); + kfree(xt); + } + mutex_unlock(&net->nft.commit_mutex); +} + +static struct pernet_operations nft_compat_net_ops = { + .init = nft_compat_init_net, + .exit = nft_compat_exit_net, + .id = &nft_compat_net_id, + .size = sizeof(struct nft_compat_net), +}; + static int __init nft_compat_module_init(void) { int ret; + ret = register_pernet_subsys(&nft_compat_net_ops); + if (ret < 0) + goto err_target; + ret = nft_register_expr(&nft_match_type); if (ret < 0) - return ret; + goto err_pernet; ret = nft_register_expr(&nft_target_type); if (ret < 0) @@ -942,45 +1015,21 @@ static int __init nft_compat_module_init(void) } return ret; - err_target: nft_unregister_expr(&nft_target_type); err_match: nft_unregister_expr(&nft_match_type); +err_pernet: + unregister_pernet_subsys(&nft_compat_net_ops); return ret; } static void __exit nft_compat_module_exit(void) { - struct nft_xt *xt, *next; - - /* list should be empty here, it can be non-empty only in case there - * was an error that caused nft_xt expr to not be initialized fully - * and noone else requested the same expression later. - * - * In this case, the lists contain 0-refcount entries that still - * hold module reference. - */ - list_for_each_entry_safe(xt, next, &nft_target_list, head) { - struct xt_target *target = xt->ops.data; - - if (WARN_ON_ONCE(refcount_read(&xt->refcnt))) - continue; - module_put(target->me); - kfree(xt); - } - - list_for_each_entry_safe(xt, next, &nft_match_list, head) { - struct xt_match *match = xt->ops.data; - - if (WARN_ON_ONCE(refcount_read(&xt->refcnt))) - continue; - module_put(match->me); - kfree(xt); - } nfnetlink_subsys_unregister(&nfnl_compat_subsys); nft_unregister_expr(&nft_target_type); nft_unregister_expr(&nft_match_type); + unregister_pernet_subsys(&nft_compat_net_ops); } MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT); -- cgit v1.2.3 From b2e3d68d1251a051a620f9086e18f7ffa6833b5b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 Jan 2019 14:28:50 +0100 Subject: netfilter: nft_compat: destroy function must not have side effects The nft_compat destroy function deletes the nft_xt object from a list. This isn't allowed anymore. Destroy functions are called asynchronously, i.e. next batch can find the object that has a pending ->destroy() invocation: cpu0 cpu1 worker ->destroy for_each_entry() if (x == ... return x->ops; list_del(x) kfree_rcu(x) expr->ops->... // ops was free'd To resolve this, the list_del needs to occur before the transaction mutex gets released. nf_tables has a 'deactivate' hook for this purpose, so use that to unlink the object from the list. Fixes: 0935d5588400 ("netfilter: nf_tables: asynchronous release") Reported-by: Taehee Yoo Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 48 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index abed3490a8f8..5eb269428832 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -29,6 +29,9 @@ struct nft_xt { struct nft_expr_ops ops; refcount_t refcnt; + /* used only when transaction mutex is locked */ + unsigned int listcnt; + /* Unlike other expressions, ops doesn't have static storage duration. * nft core assumes they do. We use kfree_rcu so that nft core can * can check expr->ops->size even after nft_compat->destroy() frees @@ -61,7 +64,7 @@ static struct nft_compat_net *nft_compat_pernet(struct net *net) static bool nft_xt_put(struct nft_xt *xt) { if (refcount_dec_and_test(&xt->refcnt)) { - list_del(&xt->head); + WARN_ON_ONCE(!list_empty(&xt->head)); kfree_rcu(xt, rcu_head); return true; } @@ -555,6 +558,43 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) __nft_match_destroy(ctx, expr, nft_expr_priv(expr)); } +static void nft_compat_activate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + struct list_head *h) +{ + struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops); + + if (xt->listcnt == 0) + list_add(&xt->head, h); + + xt->listcnt++; +} + +static void nft_compat_activate_mt(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_compat_net *cn = nft_compat_pernet(ctx->net); + + nft_compat_activate(ctx, expr, &cn->nft_match_list); +} + +static void nft_compat_activate_tg(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_compat_net *cn = nft_compat_pernet(ctx->net); + + nft_compat_activate(ctx, expr, &cn->nft_target_list); +} + +static void nft_compat_deactivate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops); + + if (--xt->listcnt == 0) + list_del_init(&xt->head); +} + static void nft_match_large_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { @@ -808,6 +848,8 @@ nft_match_select_ops(const struct nft_ctx *ctx, nft_match->ops.eval = nft_match_eval; nft_match->ops.init = nft_match_init; nft_match->ops.destroy = nft_match_destroy; + nft_match->ops.activate = nft_compat_activate_mt; + nft_match->ops.deactivate = nft_compat_deactivate; nft_match->ops.dump = nft_match_dump; nft_match->ops.validate = nft_match_validate; nft_match->ops.data = match; @@ -824,6 +866,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, nft_match->ops.size = matchsize; + nft_match->listcnt = 1; list_add(&nft_match->head, &cn->nft_match_list); return &nft_match->ops; @@ -910,6 +953,8 @@ nft_target_select_ops(const struct nft_ctx *ctx, nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize)); nft_target->ops.init = nft_target_init; nft_target->ops.destroy = nft_target_destroy; + nft_target->ops.activate = nft_compat_activate_tg; + nft_target->ops.deactivate = nft_compat_deactivate; nft_target->ops.dump = nft_target_dump; nft_target->ops.validate = nft_target_validate; nft_target->ops.data = target; @@ -919,6 +964,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, else nft_target->ops.eval = nft_target_eval_xt; + nft_target->listcnt = 1; list_add(&nft_target->head, &cn->nft_target_list); return &nft_target->ops; -- cgit v1.2.3 From 28c1382fa28f2e2d9d0d6f25ae879b5af2ecbd03 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Thu, 17 Jan 2019 09:46:41 +0800 Subject: net: bridge: Fix ethernet header pointer before check skb forwardable The skb header should be set to ethernet header before using is_skb_forwardable. Because the ethernet header length has been considered in is_skb_forwardable(including dev->hard_header_len length). To reproduce the issue: 1, add 2 ports on linux bridge br using following commands: $ brctl addbr br $ brctl addif br eth0 $ brctl addif br eth1 2, the MTU of eth0 and eth1 is 1500 3, send a packet(Data 1480, UDP 8, IP 20, Ethernet 14, VLAN 4) from eth0 to eth1 So the expect result is packet larger than 1500 cannot pass through eth0 and eth1. But currently, the packet passes through success, it means eth1's MTU limit doesn't take effect. Fixes: f6367b4660dd ("bridge: use is_skb_forwardable in forward path") Cc: bridge@lists.linux-foundation.org Cc: Nkolay Aleksandrov Cc: Roopa Prabhu Cc: Stephen Hemminger Signed-off-by: Yunjian Wang Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 2cb8da465b98..48ddc60b4fbd 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -36,10 +36,10 @@ static inline int should_deliver(const struct net_bridge_port *p, int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { + skb_push(skb, ETH_HLEN); if (!is_skb_forwardable(skb->dev, skb)) goto drop; - skb_push(skb, ETH_HLEN); br_drop_fake_rtable(skb); if (skb->ip_summed == CHECKSUM_PARTIAL && @@ -98,12 +98,11 @@ static void __br_forward(const struct net_bridge_port *to, net = dev_net(indev); } else { if (unlikely(netpoll_tx_running(to->br->dev))) { - if (!is_skb_forwardable(skb->dev, skb)) { + skb_push(skb, ETH_HLEN); + if (!is_skb_forwardable(skb->dev, skb)) kfree_skb(skb); - } else { - skb_push(skb, ETH_HLEN); + else br_netpoll_send_skb(to, skb); - } return; } br_hook = NF_BR_LOCAL_OUT; -- cgit v1.2.3 From 87fff3cacd0112bcaf42f932c1e44ae32b42f1fb Mon Sep 17 00:00:00 2001 From: Yang Wei Date: Thu, 17 Jan 2019 23:11:30 +0800 Subject: neighbour: Do not perturb drop profiles when neigh_probe Replace the kfree_skb() by consume_skb() to be drop monitor(dropwatch, perf) friendly. Signed-off-by: Yang Wei Signed-off-by: David S. Miller --- net/core/neighbour.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 96fdc9134726..4230400b9a30 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1007,7 +1007,7 @@ static void neigh_probe(struct neighbour *neigh) if (neigh->ops->solicit) neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); - kfree_skb(skb); + consume_skb(skb); } /* Called when a timer expires for a neighbour entry. */ -- cgit v1.2.3 From 6c57f0458022298e4da1729c67bd33ce41c14e7a Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Thu, 17 Jan 2019 15:34:38 +0000 Subject: net: Fix usage of pskb_trim_rcsum In certain cases, pskb_trim_rcsum() may change skb pointers. Reinitialize header pointers afterwards to avoid potential use-after-frees. Add a note in the documentation of pskb_trim_rcsum(). Found by KASAN. Signed-off-by: Ross Lagerwall Signed-off-by: David S. Miller --- drivers/net/ppp/pppoe.c | 1 + include/linux/skbuff.h | 1 + net/bridge/br_netfilter_ipv6.c | 1 + net/bridge/netfilter/nft_reject_bridge.c | 1 + net/ipv4/ip_input.c | 1 + 5 files changed, 5 insertions(+) (limited to 'net') diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 62dc564b251d..f22639f0116a 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -445,6 +445,7 @@ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev, if (pskb_trim_rcsum(skb, len)) goto drop; + ph = pppoe_hdr(skb); pn = pppoe_pernet(dev_net(dev)); /* Note that get_item does a sock_hold(), so sk_pppox(po) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 93f56fddd92a..95d25b010a25 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3218,6 +3218,7 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len); * * This is exactly the same as pskb_trim except that it ensures the * checksum of received packets are still valid after the operation. + * It can change skb pointers. */ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 94039f588f1d..564710f88f93 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -131,6 +131,7 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb) IPSTATS_MIB_INDISCARDS); goto drop; } + hdr = ipv6_hdr(skb); } if (hdr->nexthdr == NEXTHDR_HOP && br_nf_check_hbh_len(skb)) goto drop; diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 08cbed7d940e..419e8edf23ba 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -229,6 +229,7 @@ static bool reject6_br_csum_ok(struct sk_buff *skb, int hook) pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h))) return false; + ip6h = ipv6_hdr(skb); thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo); if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0) return false; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 26921f6b3b92..51d8efba6de2 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -488,6 +488,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) goto drop; } + iph = ip_hdr(skb); skb->transport_header = skb->network_header + iph->ihl*4; /* Remove any debris in the socket control block */ -- cgit v1.2.3 From 710ae72877378e7cde611efd30fe90502a6e5b30 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 18 Jan 2019 15:58:00 +0000 Subject: net: bridge: Mark FDB entries that were added by user as such Externally learned entries can be added by a user or by a switch driver that is notifying the bridge driver about entries that were learned in hardware. In the first case, the entries are not marked with the 'added_by_user' flag, which causes switch drivers to ignore them and not offload them. The 'added_by_user' flag can be set on externally learned FDB entries based on the 'swdev_notify' parameter in br_fdb_external_learn_add(), which effectively means if the created / updated FDB entry was added by a user or not. Fixes: 816a3bed9549 ("switchdev: Add fdb.added_by_user to switchdev notifications") Signed-off-by: Ido Schimmel Reported-by: Alexander Petrovskiy Reviewed-by: Petr Machata Cc: Roopa Prabhu Cc: Nikolay Aleksandrov Cc: bridge@lists.linux-foundation.org Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index fe3c758791ca..9e14767500ea 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -1128,6 +1128,8 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, err = -ENOMEM; goto err_unlock; } + if (swdev_notify) + fdb->added_by_user = 1; fdb->added_by_external_learn = 1; fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } else { @@ -1147,6 +1149,9 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, modified = true; } + if (swdev_notify) + fdb->added_by_user = 1; + if (modified) fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } -- cgit v1.2.3 From a0dc02039a2ee54fb4ae400e0b755ed30e73e58c Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Thu, 17 Jan 2019 16:32:42 -0500 Subject: mac80211: fix miscounting of ttl-dropped frames In ieee80211_rx_h_mesh_fwding, we increment the 'dropped_frames_ttl' counter when we decrement the ttl to zero. For unicast frames destined for other hosts, we stop processing the frame at that point. For multicast frames, we do not rebroadcast it in this case, but we do pass the frame up the stack to process it on this STA. That doesn't match the usual definition of "dropped," so don't count those as such. With this change, something like `ping6 -i0.2 ff02::1%mesh0` from a peer in a ttl=1 network no longer increments the counter rapidly. Signed-off-by: Bob Copeland Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 45aad3d3108c..27a337bc8acf 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2723,7 +2723,9 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) skb_set_queue_mapping(skb, q); if (!--mesh_hdr->ttl) { - IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_ttl); + if (!is_multicast_ether_addr(hdr->addr1)) + IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, + dropped_frames_ttl); goto out; } -- cgit v1.2.3 From e7c87bd6cc4ec7b0ac1ed0a88a58f8206c577488 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 15 Jan 2019 20:19:22 -0500 Subject: bpf: in __bpf_redirect_no_mac pull mac only if present Syzkaller was able to construct a packet of negative length by redirecting from bpf_prog_test_run_skb with BPF_PROG_TYPE_LWT_XMIT: BUG: KASAN: slab-out-of-bounds in memcpy include/linux/string.h:345 [inline] BUG: KASAN: slab-out-of-bounds in skb_copy_from_linear_data include/linux/skbuff.h:3421 [inline] BUG: KASAN: slab-out-of-bounds in __pskb_copy_fclone+0x2dd/0xeb0 net/core/skbuff.c:1395 Read of size 4294967282 at addr ffff8801d798009c by task syz-executor2/12942 kasan_report.cold.9+0x242/0x309 mm/kasan/report.c:412 check_memory_region_inline mm/kasan/kasan.c:260 [inline] check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267 memcpy+0x23/0x50 mm/kasan/kasan.c:302 memcpy include/linux/string.h:345 [inline] skb_copy_from_linear_data include/linux/skbuff.h:3421 [inline] __pskb_copy_fclone+0x2dd/0xeb0 net/core/skbuff.c:1395 __pskb_copy include/linux/skbuff.h:1053 [inline] pskb_copy include/linux/skbuff.h:2904 [inline] skb_realloc_headroom+0xe7/0x120 net/core/skbuff.c:1539 ipip6_tunnel_xmit net/ipv6/sit.c:965 [inline] sit_tunnel_xmit+0xe1b/0x30d0 net/ipv6/sit.c:1029 __netdev_start_xmit include/linux/netdevice.h:4325 [inline] netdev_start_xmit include/linux/netdevice.h:4334 [inline] xmit_one net/core/dev.c:3219 [inline] dev_hard_start_xmit+0x295/0xc90 net/core/dev.c:3235 __dev_queue_xmit+0x2f0d/0x3950 net/core/dev.c:3805 dev_queue_xmit+0x17/0x20 net/core/dev.c:3838 __bpf_tx_skb net/core/filter.c:2016 [inline] __bpf_redirect_common net/core/filter.c:2054 [inline] __bpf_redirect+0x5cf/0xb20 net/core/filter.c:2061 ____bpf_clone_redirect net/core/filter.c:2094 [inline] bpf_clone_redirect+0x2f6/0x490 net/core/filter.c:2066 bpf_prog_41f2bcae09cd4ac3+0xb25/0x1000 The generated test constructs a packet with mac header, network header, skb->data pointing to network header and skb->len 0. Redirecting to a sit0 through __bpf_redirect_no_mac pulls the mac length, even though skb->data already is at skb->network_header. bpf_prog_test_run_skb has already pulled it as LWT_XMIT !is_l2. Update the offset calculation to pull only if skb->data differs from skb->network_header, which is not true in this case. The test itself can be run only from commit 1cf1cae963c2 ("bpf: introduce BPF_PROG_TEST_RUN command"), but the same type of packets with skb at network header could already be built from lwt xmit hooks, so this fix is more relevant to that commit. Also set the mac header on redirect from LWT_XMIT, as even after this change to __bpf_redirect_no_mac that field is expected to be set, but is not yet in ip_finish_output2. Fixes: 3a0af8fd61f9 ("bpf: BPF for lightweight tunnel infrastructure") Reported-by: syzbot Signed-off-by: Willem de Bruijn Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- net/core/filter.c | 21 +++++++++++---------- net/core/lwt_bpf.c | 1 + 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index dab10d21cae8..7559d6835ecb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2020,18 +2020,19 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, u32 flags) { - /* skb->mac_len is not set on normal egress */ - unsigned int mlen = skb->network_header - skb->mac_header; + unsigned int mlen = skb_network_offset(skb); - __skb_pull(skb, mlen); + if (mlen) { + __skb_pull(skb, mlen); - /* At ingress, the mac header has already been pulled once. - * At egress, skb_pospull_rcsum has to be done in case that - * the skb is originated from ingress (i.e. a forwarded skb) - * to ensure that rcsum starts at net header. - */ - if (!skb_at_tc_ingress(skb)) - skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); + /* At ingress, the mac header has already been pulled once. + * At egress, skb_pospull_rcsum has to be done in case that + * the skb is originated from ingress (i.e. a forwarded skb) + * to ensure that rcsum starts at net header. + */ + if (!skb_at_tc_ingress(skb)) + skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); + } skb_pop_mac_header(skb); skb_reset_mac_len(skb); return flags & BPF_F_INGRESS ? diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 3e85437f7106..a648568c5e8f 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -63,6 +63,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, lwt->name ? : ""); ret = BPF_OK; } else { + skb_reset_mac_header(skb); ret = skb_do_redirect(skb); if (ret == 0) ret = BPF_REDIRECT; -- cgit v1.2.3 From 4aac9228d16458cedcfd90c7fb37211cf3653ac3 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 14 Jan 2019 21:13:10 +0100 Subject: libceph: avoid KEEPALIVE_PENDING races in ceph_con_keepalive() con_fault() can transition the connection into STANDBY right after ceph_con_keepalive() clears STANDBY in clear_standby(): libceph user thread ceph-msgr worker ceph_con_keepalive() mutex_lock(&con->mutex) clear_standby(con) mutex_unlock(&con->mutex) mutex_lock(&con->mutex) con_fault() ... if KEEPALIVE_PENDING isn't set set state to STANDBY ... mutex_unlock(&con->mutex) set KEEPALIVE_PENDING set WRITE_PENDING This triggers warnings in clear_standby() when either ceph_con_send() or ceph_con_keepalive() get to clearing STANDBY next time. I don't see a reason to condition queue_con() call on the previous value of KEEPALIVE_PENDING, so move the setting of KEEPALIVE_PENDING into the critical section -- unlike WRITE_PENDING, KEEPALIVE_PENDING could have been a non-atomic flag. Reported-by: syzbot+acdeb633f6211ccdf886@syzkaller.appspotmail.com Signed-off-by: Ilya Dryomov Tested-by: Myungho Jung --- net/ceph/messenger.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d5718284db57..3661cdd927f1 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3206,9 +3206,10 @@ void ceph_con_keepalive(struct ceph_connection *con) dout("con_keepalive %p\n", con); mutex_lock(&con->mutex); clear_standby(con); + con_flag_set(con, CON_FLAG_KEEPALIVE_PENDING); mutex_unlock(&con->mutex); - if (con_flag_test_and_set(con, CON_FLAG_KEEPALIVE_PENDING) == 0 && - con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) + + if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); -- cgit v1.2.3 From 93171ba6f1deffd82f381d36cb13177872d023f6 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Sun, 13 Jan 2019 19:31:43 +0100 Subject: can: bcm: check timer values before ktime conversion Kyungtae Kim detected a potential integer overflow in bcm_[rx|tx]_setup() when the conversion into ktime multiplies the given value with NSEC_PER_USEC (1000). Reference: https://marc.info/?l=linux-can&m=154732118819828&w=2 Add a check for the given tv_usec, so that the value stays below one second. Additionally limit the tv_sec value to a reasonable value for CAN related use-cases of 400 days and ensure all values to be positive. Reported-by: Kyungtae Kim Tested-by: Oliver Hartkopp Signed-off-by: Oliver Hartkopp Cc: linux-stable # >= 2.6.26 Tested-by: Kyungtae Kim Acked-by: Andre Naujoks Signed-off-by: Marc Kleine-Budde --- net/can/bcm.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'net') diff --git a/net/can/bcm.c b/net/can/bcm.c index 0af8f0db892a..79bb8afa9c0c 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -67,6 +67,9 @@ */ #define MAX_NFRAMES 256 +/* limit timers to 400 days for sending/timeouts */ +#define BCM_TIMER_SEC_MAX (400 * 24 * 60 * 60) + /* use of last_frames[index].flags */ #define RX_RECV 0x40 /* received data for this element */ #define RX_THR 0x80 /* element not been sent due to throttle feature */ @@ -140,6 +143,22 @@ static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv) return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC); } +/* check limitations for timeval provided by user */ +static bool bcm_is_invalid_tv(struct bcm_msg_head *msg_head) +{ + if ((msg_head->ival1.tv_sec < 0) || + (msg_head->ival1.tv_sec > BCM_TIMER_SEC_MAX) || + (msg_head->ival1.tv_usec < 0) || + (msg_head->ival1.tv_usec >= USEC_PER_SEC) || + (msg_head->ival2.tv_sec < 0) || + (msg_head->ival2.tv_sec > BCM_TIMER_SEC_MAX) || + (msg_head->ival2.tv_usec < 0) || + (msg_head->ival2.tv_usec >= USEC_PER_SEC)) + return true; + + return false; +} + #define CFSIZ(flags) ((flags & CAN_FD_FRAME) ? CANFD_MTU : CAN_MTU) #define OPSIZ sizeof(struct bcm_op) #define MHSIZ sizeof(struct bcm_msg_head) @@ -873,6 +892,10 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES) return -EINVAL; + /* check timeval limitations */ + if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head)) + return -EINVAL; + /* check the given can_id */ op = bcm_find_op(&bo->tx_ops, msg_head, ifindex); if (op) { @@ -1053,6 +1076,10 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, (!(msg_head->can_id & CAN_RTR_FLAG)))) return -EINVAL; + /* check timeval limitations */ + if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head)) + return -EINVAL; + /* check the given can_id */ op = bcm_find_op(&bo->rx_ops, msg_head, ifindex); if (op) { -- cgit v1.2.3 From cb73ee40b1b381eaf3749e6dbeed567bb38e5258 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 18 Jan 2019 12:05:39 +0100 Subject: net: ip_gre: use erspan key field for tunnel lookup Use ERSPAN key header field as tunnel key in gre_parse_header routine since ERSPAN protocol sets the key field of the external GRE header to 0 resulting in a tunnel lookup fail in ip6gre_err. In addition remove key field parsing and pskb_may_pull check in erspan_rcv and ip6erspan_rcv Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support") Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- net/ipv4/gre_demux.c | 17 +++++++++++++++++ net/ipv4/ip_gre.c | 9 --------- net/ipv6/ip6_gre.c | 4 ---- 3 files changed, 17 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index a4bf22ee3aed..7c4a41dc04bb 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -119,6 +120,22 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, hdr_len += 4; } tpi->hdr_len = hdr_len; + + /* ERSPAN ver 1 and 2 protocol sets GRE key field + * to 0 and sets the configured key in the + * inner erspan header field + */ + if (greh->protocol == htons(ETH_P_ERSPAN) || + greh->protocol == htons(ETH_P_ERSPAN2)) { + struct erspan_base_hdr *ershdr; + + if (!pskb_may_pull(skb, nhs + hdr_len + sizeof(*ershdr))) + return -EINVAL; + + ershdr = (struct erspan_base_hdr *)options; + tpi->key = cpu_to_be32(get_session_id(ershdr)); + } + return hdr_len; } EXPORT_SYMBOL(gre_parse_header); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index b1a74d80d868..20a64fe6254b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -268,20 +268,11 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int len; itn = net_generic(net, erspan_net_id); - len = gre_hdr_len + sizeof(*ershdr); - - /* Check based hdr len */ - if (unlikely(!pskb_may_pull(skb, len))) - return PACKET_REJECT; iph = ip_hdr(skb); ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); ver = ershdr->ver; - /* The original GRE header does not have key field, - * Use ERSPAN 10-bit session ID as key. - */ - tpi->key = cpu_to_be32(get_session_id(ershdr)); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, iph->saddr, iph->daddr, tpi->key); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b1be67ca6768..4416368dbd49 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -534,13 +534,9 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len, struct ip6_tnl *tunnel; u8 ver; - if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr)))) - return PACKET_REJECT; - ipv6h = ipv6_hdr(skb); ershdr = (struct erspan_base_hdr *)skb->data; ver = ershdr->ver; - tpi->key = cpu_to_be32(get_session_id(ershdr)); tunnel = ip6gre_tunnel_lookup(skb->dev, &ipv6h->saddr, &ipv6h->daddr, tpi->key, -- cgit v1.2.3 From 1518039f6b5ac794313c24c76f85cead0cd60f6c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 22 Jan 2019 14:47:19 -0800 Subject: net/ipv6: don't return positive numbers when nothing was dumped in6_dump_addrs() returns a positive 1 if there was nothing to dump. This return value can not be passed as return from inet6_dump_addr() as is, because it will confuse rtnetlink, resulting in NLMSG_DONE never getting set: $ ip addr list dev lo EOF on netlink Dump terminated v2: flip condition to avoid a new goto (DaveA) Fixes: 7c1e8a3817c5 ("netlink: fixup regression in RTM_GETADDR") Reported-by: Brendan Galloway Signed-off-by: Jakub Kicinski Reviewed-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 93d5ad2b1a69..0c9e20ac01ab 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5120,6 +5120,8 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, if (idev) { err = in6_dump_addrs(idev, skb, cb, s_ip_idx, &fillargs); + if (err > 0) + err = 0; } goto put_tgt_net; } -- cgit v1.2.3 From 7c62b8dd5ca89dabd9f455d19e663bad60951bd5 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Mon, 21 Jan 2019 14:54:20 +0100 Subject: net/ipv6: lower the level of "link is not ready" messages This message gets logged far too often for how interesting is it. Most distributions nowadays configure NetworkManager to use randomly generated MAC addresses for Wi-Fi network scans. The interfaces end up being periodically brought down for the address change. When they're subsequently brought back up, the message is logged, eventually flooding the log. Perhaps the message is not all that helpful: it seems to be more interesting to hear when the addrconf actually start, not when it does not. Let's lower its level. Signed-off-by: Lubomir Rintel Acked-By: Thomas Haller Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 0c9e20ac01ab..84c358804355 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3495,8 +3495,8 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (!addrconf_link_ready(dev)) { /* device is not ready yet. */ - pr_info("ADDRCONF(NETDEV_UP): %s: link is not ready\n", - dev->name); + pr_debug("ADDRCONF(NETDEV_UP): %s: link is not ready\n", + dev->name); break; } -- cgit v1.2.3 From c9e4576743eeda8d24dedc164d65b78877f9a98c Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 23 Jan 2019 12:37:19 +0800 Subject: bpf: sock recvbuff must be limited by rmem_max in bpf_setsockopt() When sock recvbuff is set by bpf_setsockopt(), the value must by limited by rmem_max. It is the same with sendbuff. Fixes: 8c4b4c7e9ff0 ("bpf: Add setsockopt helper function to bpf") Signed-off-by: Yafang Shao Acked-by: Martin KaFai Lau Acked-by: Lawrence Brakmo Signed-off-by: Daniel Borkmann --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 7559d6835ecb..7a54dc11ac2d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4112,10 +4112,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, /* Only some socketops are supported */ switch (optname) { case SO_RCVBUF: + val = min_t(u32, val, sysctl_rmem_max); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); break; case SO_SNDBUF: + val = min_t(u32, val, sysctl_wmem_max); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); break; -- cgit v1.2.3 From 63530aba7826a0f8e129874df9c4d264f9db3f9e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Jan 2019 10:40:59 -0800 Subject: ax25: fix possible use-after-free syzbot found that ax25 routes where not properly protected against concurrent use [1]. In this particular report the bug happened while copying ax25->digipeat. Fix this problem by making sure we call ax25_get_route() while ax25_route_lock is held, so that no modification could happen while using the route. The current two ax25_get_route() callers do not sleep, so this change should be fine. Once we do that, ax25_get_route() no longer needs to grab a reference on the found route. [1] ax25_connect(): syz-executor0 uses autobind, please contact jreuter@yaina.de BUG: KASAN: use-after-free in memcpy include/linux/string.h:352 [inline] BUG: KASAN: use-after-free in kmemdup+0x42/0x60 mm/util.c:113 Read of size 66 at addr ffff888066641a80 by task syz-executor2/531 ax25_connect(): syz-executor0 uses autobind, please contact jreuter@yaina.de CPU: 1 PID: 531 Comm: syz-executor2 Not tainted 5.0.0-rc2+ #10 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1db/0x2d0 lib/dump_stack.c:113 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:187 kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317 check_memory_region_inline mm/kasan/generic.c:185 [inline] check_memory_region+0x123/0x190 mm/kasan/generic.c:191 memcpy+0x24/0x50 mm/kasan/common.c:130 memcpy include/linux/string.h:352 [inline] kmemdup+0x42/0x60 mm/util.c:113 kmemdup include/linux/string.h:425 [inline] ax25_rt_autobind+0x25d/0x750 net/ax25/ax25_route.c:424 ax25_connect.cold+0x30/0xa4 net/ax25/af_ax25.c:1224 __sys_connect+0x357/0x490 net/socket.c:1664 __do_sys_connect net/socket.c:1675 [inline] __se_sys_connect net/socket.c:1672 [inline] __x64_sys_connect+0x73/0xb0 net/socket.c:1672 do_syscall_64+0x1a3/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x458099 Code: 6d b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 3b b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f870ee22c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002a RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000458099 RDX: 0000000000000048 RSI: 0000000020000080 RDI: 0000000000000005 RBP: 000000000073bf00 R08: 0000000000000000 R09: 0000000000000000 ax25_connect(): syz-executor4 uses autobind, please contact jreuter@yaina.de R10: 0000000000000000 R11: 0000000000000246 R12: 00007f870ee236d4 R13: 00000000004be48e R14: 00000000004ce9a8 R15: 00000000ffffffff Allocated by task 526: save_stack+0x45/0xd0 mm/kasan/common.c:73 set_track mm/kasan/common.c:85 [inline] __kasan_kmalloc mm/kasan/common.c:496 [inline] __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:469 kasan_kmalloc+0x9/0x10 mm/kasan/common.c:504 ax25_connect(): syz-executor5 uses autobind, please contact jreuter@yaina.de kmem_cache_alloc_trace+0x151/0x760 mm/slab.c:3609 kmalloc include/linux/slab.h:545 [inline] ax25_rt_add net/ax25/ax25_route.c:95 [inline] ax25_rt_ioctl+0x3b9/0x1270 net/ax25/ax25_route.c:233 ax25_ioctl+0x322/0x10b0 net/ax25/af_ax25.c:1763 sock_do_ioctl+0xe2/0x400 net/socket.c:950 sock_ioctl+0x32f/0x6c0 net/socket.c:1074 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:509 [inline] do_vfs_ioctl+0x107b/0x17d0 fs/ioctl.c:696 ksys_ioctl+0xab/0xd0 fs/ioctl.c:713 __do_sys_ioctl fs/ioctl.c:720 [inline] __se_sys_ioctl fs/ioctl.c:718 [inline] __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:718 do_syscall_64+0x1a3/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe ax25_connect(): syz-executor5 uses autobind, please contact jreuter@yaina.de Freed by task 550: save_stack+0x45/0xd0 mm/kasan/common.c:73 set_track mm/kasan/common.c:85 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/common.c:458 kasan_slab_free+0xe/0x10 mm/kasan/common.c:466 __cache_free mm/slab.c:3487 [inline] kfree+0xcf/0x230 mm/slab.c:3806 ax25_rt_add net/ax25/ax25_route.c:92 [inline] ax25_rt_ioctl+0x304/0x1270 net/ax25/ax25_route.c:233 ax25_ioctl+0x322/0x10b0 net/ax25/af_ax25.c:1763 sock_do_ioctl+0xe2/0x400 net/socket.c:950 sock_ioctl+0x32f/0x6c0 net/socket.c:1074 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:509 [inline] do_vfs_ioctl+0x107b/0x17d0 fs/ioctl.c:696 ksys_ioctl+0xab/0xd0 fs/ioctl.c:713 __do_sys_ioctl fs/ioctl.c:720 [inline] __se_sys_ioctl fs/ioctl.c:718 [inline] __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:718 do_syscall_64+0x1a3/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff888066641a80 which belongs to the cache kmalloc-96 of size 96 The buggy address is located 0 bytes inside of 96-byte region [ffff888066641a80, ffff888066641ae0) The buggy address belongs to the page: page:ffffea0001999040 count:1 mapcount:0 mapping:ffff88812c3f04c0 index:0x0 flags: 0x1fffc0000000200(slab) ax25_connect(): syz-executor4 uses autobind, please contact jreuter@yaina.de raw: 01fffc0000000200 ffffea0001817948 ffffea0002341dc8 ffff88812c3f04c0 raw: 0000000000000000 ffff888066641000 0000000100000020 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888066641980: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ffff888066641a00: 00 00 00 00 00 00 00 00 02 fc fc fc fc fc fc fc >ffff888066641a80: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ^ ffff888066641b00: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ffff888066641b80: 00 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc Signed-off-by: Eric Dumazet Cc: Ralf Baechle Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/ax25.h | 12 ++++++++++++ net/ax25/ax25_ip.c | 4 ++-- net/ax25/ax25_route.c | 19 ++++++++----------- 3 files changed, 22 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/net/ax25.h b/include/net/ax25.h index 3f9aea8087e3..8b7eb46ad72d 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -201,6 +201,18 @@ static inline void ax25_hold_route(ax25_route *ax25_rt) void __ax25_put_route(ax25_route *ax25_rt); +extern rwlock_t ax25_route_lock; + +static inline void ax25_route_lock_use(void) +{ + read_lock(&ax25_route_lock); +} + +static inline void ax25_route_lock_unuse(void) +{ + read_unlock(&ax25_route_lock); +} + static inline void ax25_put_route(ax25_route *ax25_rt) { if (refcount_dec_and_test(&ax25_rt->refcount)) diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index 70417e9b932d..314bbc8010fb 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -114,6 +114,7 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb) dst = (ax25_address *)(bp + 1); src = (ax25_address *)(bp + 8); + ax25_route_lock_use(); route = ax25_get_route(dst, NULL); if (route) { digipeat = route->digipeat; @@ -206,9 +207,8 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb) ax25_queue_xmit(skb, dev); put: - if (route) - ax25_put_route(route); + ax25_route_lock_unuse(); return NETDEV_TX_OK; } diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index a0eff323af12..66f74c85cf6b 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -40,7 +40,7 @@ #include static ax25_route *ax25_route_list; -static DEFINE_RWLOCK(ax25_route_lock); +DEFINE_RWLOCK(ax25_route_lock); void ax25_rt_device_down(struct net_device *dev) { @@ -335,6 +335,7 @@ const struct seq_operations ax25_rt_seqops = { * Find AX.25 route * * Only routes with a reference count of zero can be destroyed. + * Must be called with ax25_route_lock read locked. */ ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev) { @@ -342,7 +343,6 @@ ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev) ax25_route *ax25_def_rt = NULL; ax25_route *ax25_rt; - read_lock(&ax25_route_lock); /* * Bind to the physical interface we heard them on, or the default * route if none is found; @@ -365,11 +365,6 @@ ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev) if (ax25_spe_rt != NULL) ax25_rt = ax25_spe_rt; - if (ax25_rt != NULL) - ax25_hold_route(ax25_rt); - - read_unlock(&ax25_route_lock); - return ax25_rt; } @@ -400,9 +395,12 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) ax25_route *ax25_rt; int err = 0; - if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL) + ax25_route_lock_use(); + ax25_rt = ax25_get_route(addr, NULL); + if (!ax25_rt) { + ax25_route_lock_unuse(); return -EHOSTUNREACH; - + } if ((ax25->ax25_dev = ax25_dev_ax25dev(ax25_rt->dev)) == NULL) { err = -EHOSTUNREACH; goto put; @@ -437,8 +435,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) } put: - ax25_put_route(ax25_rt); - + ax25_route_lock_unuse(); return err; } -- cgit v1.2.3 From 53ab60baa1ac4f20b080a22c13b77b6373922fd7 Mon Sep 17 00:00:00 2001 From: ZhangXiaoxu Date: Thu, 10 Jan 2019 16:39:06 +0800 Subject: ipvs: Fix signed integer overflow when setsockopt timeout There is a UBSAN bug report as below: UBSAN: Undefined behaviour in net/netfilter/ipvs/ip_vs_ctl.c:2227:21 signed integer overflow: -2147483647 * 1000 cannot be represented in type 'int' Reproduce program: #include #include #include #define IPPROTO_IP 0 #define IPPROTO_RAW 255 #define IP_VS_BASE_CTL (64+1024+64) #define IP_VS_SO_SET_TIMEOUT (IP_VS_BASE_CTL+10) /* The argument to IP_VS_SO_GET_TIMEOUT */ struct ipvs_timeout_t { int tcp_timeout; int tcp_fin_timeout; int udp_timeout; }; int main() { int ret = -1; int sockfd = -1; struct ipvs_timeout_t to; sockfd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); if (sockfd == -1) { printf("socket init error\n"); return -1; } to.tcp_timeout = -2147483647; to.tcp_fin_timeout = -2147483647; to.udp_timeout = -2147483647; ret = setsockopt(sockfd, IPPROTO_IP, IP_VS_SO_SET_TIMEOUT, (char *)(&to), sizeof(to)); printf("setsockopt return %d\n", ret); return ret; } Return -EINVAL if the timeout value is negative or max than 'INT_MAX / HZ'. Signed-off-by: ZhangXiaoxu Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 432141f04af3..7d6318664eb2 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2220,6 +2220,18 @@ static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user u->tcp_fin_timeout, u->udp_timeout); +#ifdef CONFIG_IP_VS_PROTO_TCP + if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) || + u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) { + return -EINVAL; + } +#endif + +#ifdef CONFIG_IP_VS_PROTO_UDP + if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ)) + return -EINVAL; +#endif + #ifdef CONFIG_IP_VS_PROTO_TCP if (u->tcp_timeout) { pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); -- cgit v1.2.3 From d71b57532d70c03f4671dd04e84157ac6bf021b0 Mon Sep 17 00:00:00 2001 From: wenxu Date: Sat, 19 Jan 2019 13:11:25 +0800 Subject: ip_tunnel: Make none-tunnel-dst tunnel port work with lwtunnel ip l add dev tun type gretap key 1000 ip a a dev tun 10.0.0.1/24 Packets with tun-id 1000 can be recived by tun dev. But packet can't be sent through dev tun for non-tunnel-dst With this patch: tunnel-dst can be get through lwtunnel like beflow: ip r a 10.0.0.7 encap ip dst 172.168.0.11 dev tun Signed-off-by: wenxu Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index c4f5602308ed..054d01c16dc6 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -644,13 +644,19 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, dst = tnl_params->daddr; if (dst == 0) { /* NBMA tunnel */ + struct ip_tunnel_info *tun_info; if (!skb_dst(skb)) { dev->stats.tx_fifo_errors++; goto tx_error; } - if (skb->protocol == htons(ETH_P_IP)) { + tun_info = skb_tunnel_info(skb); + if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) && + ip_tunnel_info_af(tun_info) == AF_INET && + tun_info->key.u.ipv4.dst) + dst = tun_info->key.u.ipv4.dst; + else if (skb->protocol == htons(ETH_P_IP)) { rt = skb_rtable(skb); dst = rt_nexthop(rt, inner_iph->daddr); } -- cgit v1.2.3 From 2e6dc4d95110becfe0ff4c3d4749c33ea166e9e7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jan 2019 02:39:34 +0800 Subject: sctp: improve the events for sctp stream reset This patch is to improve sctp stream reset events in 4 places: 1. In sctp_process_strreset_outreq(), the flag should always be set with SCTP_STREAM_RESET_INCOMING_SSN instead of OUTGOING, as receiver's in stream is reset here. 2. In sctp_process_strreset_outreq(), move up SCTP_STRRESET_ERR_WRONG_SSN check, as the reset has to succeed after reconf_timer stops for the in stream reset request retransmission. 3. In sctp_process_strreset_inreq(), no event should be sent, as no in or out stream is reset here. 4. In sctp_process_strreset_resp(), SCTP_STREAM_RESET_INCOMING_SSN or OUTGOING event should always be sent for stream reset requests, no matter it fails or succeeds to process the request. Fixes: 810544764536 ("sctp: implement receiver-side procedures for the Outgoing SSN Reset Request Parameter") Fixes: 16e1a91965b0 ("sctp: implement receiver-side procedures for the Incoming SSN Reset Request Parameter") Fixes: 11ae76e67a17 ("sctp: implement receiver-side procedures for the Reconf Response Parameter") Reported-by: Ying Xu Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/stream.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 3892e7630f3a..6c188b06e5e1 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -585,9 +585,9 @@ struct sctp_chunk *sctp_process_strreset_outreq( struct sctp_strreset_outreq *outreq = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; - __u16 i, nums, flags = 0; __be16 *str_p = NULL; __u32 request_seq; + __u16 i, nums; request_seq = ntohl(outreq->request_seq); @@ -615,6 +615,15 @@ struct sctp_chunk *sctp_process_strreset_outreq( if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) goto out; + nums = (ntohs(param.p->length) - sizeof(*outreq)) / sizeof(__u16); + str_p = outreq->list_of_streams; + for (i = 0; i < nums; i++) { + if (ntohs(str_p[i]) >= stream->incnt) { + result = SCTP_STRRESET_ERR_WRONG_SSN; + goto out; + } + } + if (asoc->strreset_chunk) { if (!sctp_chunk_lookup_strreset_param( asoc, outreq->response_seq, @@ -637,32 +646,19 @@ struct sctp_chunk *sctp_process_strreset_outreq( sctp_chunk_put(asoc->strreset_chunk); asoc->strreset_chunk = NULL; } - - flags = SCTP_STREAM_RESET_INCOMING_SSN; } - nums = (ntohs(param.p->length) - sizeof(*outreq)) / sizeof(__u16); - if (nums) { - str_p = outreq->list_of_streams; - for (i = 0; i < nums; i++) { - if (ntohs(str_p[i]) >= stream->incnt) { - result = SCTP_STRRESET_ERR_WRONG_SSN; - goto out; - } - } - + if (nums) for (i = 0; i < nums; i++) SCTP_SI(stream, ntohs(str_p[i]))->mid = 0; - } else { + else for (i = 0; i < stream->incnt; i++) SCTP_SI(stream, i)->mid = 0; - } result = SCTP_STRRESET_PERFORMED; *evp = sctp_ulpevent_make_stream_reset_event(asoc, - flags | SCTP_STREAM_RESET_OUTGOING_SSN, nums, str_p, - GFP_ATOMIC); + SCTP_STREAM_RESET_INCOMING_SSN, nums, str_p, GFP_ATOMIC); out: sctp_update_strreset_result(asoc, result); @@ -738,9 +734,6 @@ struct sctp_chunk *sctp_process_strreset_inreq( result = SCTP_STRRESET_PERFORMED; - *evp = sctp_ulpevent_make_stream_reset_event(asoc, - SCTP_STREAM_RESET_INCOMING_SSN, nums, str_p, GFP_ATOMIC); - out: sctp_update_strreset_result(asoc, result); err: @@ -1036,10 +1029,10 @@ struct sctp_chunk *sctp_process_strreset_resp( sout->mid_uo = 0; } } - - flags = SCTP_STREAM_RESET_OUTGOING_SSN; } + flags |= SCTP_STREAM_RESET_OUTGOING_SSN; + for (i = 0; i < stream->outcnt; i++) SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; @@ -1058,6 +1051,8 @@ struct sctp_chunk *sctp_process_strreset_resp( nums = (ntohs(inreq->param_hdr.length) - sizeof(*inreq)) / sizeof(__u16); + flags |= SCTP_STREAM_RESET_INCOMING_SSN; + *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, nums, str_p, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_TSN_REQUEST) { -- cgit v1.2.3 From 8220c870cb0f4eaa4e335c9645dbd9a1c461c1dd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jan 2019 02:40:12 +0800 Subject: sctp: improve the events for sctp stream adding This patch is to improve sctp stream adding events in 2 places: 1. In sctp_process_strreset_addstrm_out(), move up SCTP_MAX_STREAM and in stream allocation failure checks, as the adding has to succeed after reconf_timer stops for the in stream adding request retransmission. 3. In sctp_process_strreset_addstrm_in(), no event should be sent, as no in or out stream is added here. Fixes: 50a41591f110 ("sctp: implement receiver-side procedures for the Add Outgoing Streams Request Parameter") Fixes: c5c4ebb3ab87 ("sctp: implement receiver-side procedures for the Add Incoming Streams Request Parameter") Reported-by: Ying Xu Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/stream.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 6c188b06e5e1..80e0ae5534ec 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -866,6 +866,14 @@ struct sctp_chunk *sctp_process_strreset_addstrm_out( if (!(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) goto out; + in = ntohs(addstrm->number_of_streams); + incnt = stream->incnt + in; + if (!in || incnt > SCTP_MAX_STREAM) + goto out; + + if (sctp_stream_alloc_in(stream, incnt, GFP_ATOMIC)) + goto out; + if (asoc->strreset_chunk) { if (!sctp_chunk_lookup_strreset_param( asoc, 0, SCTP_PARAM_RESET_ADD_IN_STREAMS)) { @@ -889,14 +897,6 @@ struct sctp_chunk *sctp_process_strreset_addstrm_out( } } - in = ntohs(addstrm->number_of_streams); - incnt = stream->incnt + in; - if (!in || incnt > SCTP_MAX_STREAM) - goto out; - - if (sctp_stream_alloc_in(stream, incnt, GFP_ATOMIC)) - goto out; - stream->incnt = incnt; result = SCTP_STRRESET_PERFORMED; @@ -966,9 +966,6 @@ struct sctp_chunk *sctp_process_strreset_addstrm_in( result = SCTP_STRRESET_PERFORMED; - *evp = sctp_ulpevent_make_stream_change_event(asoc, - 0, 0, ntohs(addstrm->number_of_streams), GFP_ATOMIC); - out: sctp_update_strreset_result(asoc, result); err: -- cgit v1.2.3 From 4ff40b86262b73553ee47cc3784ce8ba0f220bd8 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jan 2019 02:42:09 +0800 Subject: sctp: set chunk transport correctly when it's a new asoc In the paths: sctp_sf_do_unexpected_init() -> sctp_make_init_ack() sctp_sf_do_dupcook_a/b()() -> sctp_sf_do_5_1D_ce() The new chunk 'retval' transport is set from the incoming chunk 'chunk' transport. However, 'retval' transport belong to the new asoc, which is a different one from 'chunk' transport's asoc. It will cause that the 'retval' chunk gets set with a wrong transport. Later when sending it and because of Commit b9fd683982c9 ("sctp: add sctp_packet_singleton"), sctp_packet_singleton() will set some fields, like vtag to 'retval' chunk from that wrong transport's asoc. This patch is to fix it by setting 'retval' transport correctly which belongs to the right asoc in sctp_make_init_ack() and sctp_sf_do_5_1D_ce(). Fixes: b9fd683982c9 ("sctp: add sctp_packet_singleton") Reported-by: Ying Xu Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index f4ac6c592e13..d05c57664e36 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -495,7 +495,10 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, * * [INIT ACK back to where the INIT came from.] */ - retval->transport = chunk->transport; + if (chunk->transport) + retval->transport = + sctp_assoc_lookup_paddr(asoc, + &chunk->transport->ipaddr); retval->subh.init_hdr = sctp_addto_chunk(retval, sizeof(initack), &initack); @@ -642,8 +645,10 @@ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, * * [COOKIE ACK back to where the COOKIE ECHO came from.] */ - if (retval && chunk) - retval->transport = chunk->transport; + if (retval && chunk && chunk->transport) + retval->transport = + sctp_assoc_lookup_paddr(asoc, + &chunk->transport->ipaddr); return retval; } -- cgit v1.2.3 From ecf938fe7d0088077ee1280419a2b3c5429b47c8 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 22 Jan 2019 02:42:41 +0800 Subject: sctp: set flow sport from saddr only when it's 0 Now sctp_transport_pmtu() passes transport->saddr into .get_dst() to set flow sport from 'saddr'. However, transport->saddr is set only when transport->dst exists in sctp_transport_route(). If sctp_transport_pmtu() is called without transport->saddr set, like when transport->dst doesn't exists, the flow sport will be set to 0 from transport->saddr, which will cause a wrong route to be got. Commit 6e91b578bf3f ("sctp: re-use sctp_transport_pmtu in sctp_transport_route") made the issue be triggered more easily since sctp_transport_pmtu() would be called in sctp_transport_route() after that. In gerneral, fl4->fl4_sport should always be set to htons(asoc->base.bind_addr.port), unless transport->asoc doesn't exist in sctp_v4/6_get_dst(), which is the case: sctp_ootb_pkt_new() -> sctp_transport_route() For that, we can simply handle it by setting flow sport from saddr only when it's 0 in sctp_v4/6_get_dst(). Fixes: 6e91b578bf3f ("sctp: re-use sctp_transport_pmtu in sctp_transport_route") Reported-by: Ying Xu Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 3 ++- net/sctp/protocol.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index ed8e006dae85..6200cd2b4b99 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -280,7 +280,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, if (saddr) { fl6->saddr = saddr->v6.sin6_addr; - fl6->fl6_sport = saddr->v6.sin6_port; + if (!fl6->fl6_sport) + fl6->fl6_sport = saddr->v6.sin6_port; pr_debug("src=%pI6 - ", &fl6->saddr); } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 4e0eeb113ef5..6abc8b274270 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -440,7 +440,8 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, } if (saddr) { fl4->saddr = saddr->v4.sin_addr.s_addr; - fl4->fl4_sport = saddr->v4.sin_port; + if (!fl4->fl4_sport) + fl4->fl4_sport = saddr->v4.sin_port; } pr_debug("%s: dst:%pI4, src:%pI4 - ", __func__, &fl4->daddr, -- cgit v1.2.3 From 7d652669b61d702c6e62a39579d17f6881670ab6 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 25 Jan 2019 08:21:26 +0100 Subject: batman-adv: release station info tidstats With the addition of TXQ stats in the per-tid statistics the struct station_info grew significantly. This resulted in stack size warnings due to the structure itself being above the limit for the warnings. To work around this, the TID array was allocated dynamically. Also a function to free this content was introduced with commit 7ea3e110f2f8 ("cfg80211: release station info tidstats where needed") but the necessary changes were not provided for batman-adv's B.A.T.M.A.N. V implementation. Signed-off-by: Felix Fietkau Fixes: 8689c051a201 ("cfg80211: dynamically allocate per-tid stats for station info") [sven@narfation.org: add commit message] Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index e8090f099eb8..ef0dec20c7d8 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -104,6 +104,9 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) ret = cfg80211_get_station(real_netdev, neigh->addr, &sinfo); + /* free the TID stats immediately */ + cfg80211_sinfo_release_content(&sinfo); + dev_put(real_netdev); if (ret == -ENOENT) { /* Node is not associated anymore! It would be -- cgit v1.2.3 From a8b5c6d69261889e022e9d64ac7ee8741db730bf Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 25 Jan 2019 09:26:32 +0100 Subject: nl80211: fix NLA_POLICY_NESTED() arguments syzbot reported an out-of-bounds read when passing certain malformed messages into nl80211. The specific place where this happened isn't interesting, the problem is that nested policy parsing was referring to the wrong maximum attribute and thus the policy wasn't long enough. Fix this by referring to the correct attribute. Since this is really not necessary, I'll come up with a separate patch to just pass the policy instead of both, in the common case we can infer the maxattr from the size of the policy array. Reported-by: syzbot+4157b036c5f4713b1f2f@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Fixes: 9bb7e0f24e7e ("cfg80211: add peer measurement with FTM initiator API") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5e49492d5911..74150ad95823 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -555,7 +555,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { }, [NL80211_ATTR_TIMEOUT] = NLA_POLICY_MIN(NLA_U32, 1), [NL80211_ATTR_PEER_MEASUREMENTS] = - NLA_POLICY_NESTED(NL80211_PMSR_FTM_REQ_ATTR_MAX, + NLA_POLICY_NESTED(NL80211_PMSR_ATTR_MAX, nl80211_pmsr_attr_policy), }; -- cgit v1.2.3 From 7ed5285396c257fd4070b1e29e7b2341aae2a1ce Mon Sep 17 00:00:00 2001 From: Balaji Pothunoori Date: Mon, 21 Jan 2019 12:30:43 +0530 Subject: mac80211: don't initiate TDLS connection if station is not associated to AP Following call trace is observed while adding TDLS peer entry in driver during TDLS setup. Call Trace: [] dump_stack+0x47/0x61 [] __warn+0xe2/0x100 [] ? sta_apply_parameters+0x49f/0x550 [mac80211] [] warn_slowpath_null+0x25/0x30 [] sta_apply_parameters+0x49f/0x550 [mac80211] [] ? sta_info_alloc+0x1c2/0x450 [mac80211] [] ieee80211_add_station+0xe3/0x160 [mac80211] [] nl80211_new_station+0x273/0x420 [] genl_rcv_msg+0x219/0x3c0 [] ? genl_rcv+0x30/0x30 [] netlink_rcv_skb+0x8e/0xb0 [] genl_rcv+0x1c/0x30 [] netlink_unicast+0x13a/0x1d0 [] netlink_sendmsg+0x2d8/0x390 [] sock_sendmsg+0x2d/0x40 [] ___sys_sendmsg+0x1d9/0x1e0 Fixing this by allowing TDLS setup request only when we have completed association. Signed-off-by: Balaji Pothunoori Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index de65fe3ed9cc..2493c74c2d37 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1490,6 +1490,10 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) sta->sta.tdls = true; + if (sta->sta.tdls && sdata->vif.type == NL80211_IFTYPE_STATION && + !sdata->u.mgd.associated) + return -EINVAL; + err = sta_apply_parameters(local, sta, params); if (err) { sta_info_free(local, sta); -- cgit v1.2.3 From 7c53eb5d87bc21464da4268c3c0c47457b6d9c9b Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Thu, 24 Jan 2019 19:19:57 +0100 Subject: mac80211: Add attribute aligned(2) to struct 'action' During refactor in commit 9e478066eae4 ("mac80211: fix MU-MIMO follow-MAC mode") a new struct 'action' was declared with packed attribute as: struct { struct ieee80211_hdr_3addr hdr; u8 category; u8 action_code; } __packed action; But since struct 'ieee80211_hdr_3addr' is declared with an aligned keyword as: struct ieee80211_hdr { __le16 frame_control; __le16 duration_id; u8 addr1[ETH_ALEN]; u8 addr2[ETH_ALEN]; u8 addr3[ETH_ALEN]; __le16 seq_ctrl; u8 addr4[ETH_ALEN]; } __packed __aligned(2); Solve the ambiguity of placing aligned structure in a packed one by adding the aligned(2) attribute to struct 'action'. This removes the following warning (W=1): net/mac80211/rx.c:234:2: warning: alignment 1 of 'struct ' is less than 2 [-Wpacked-not-aligned] Cc: Johannes Berg Suggested-by: Johannes Berg Signed-off-by: Mathieu Malaterre Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 27a337bc8acf..bb4d71efb6fb 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -231,7 +231,7 @@ static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr_3addr hdr; u8 category; u8 action_code; - } __packed action; + } __packed __aligned(2) action; if (!sdata) return; -- cgit v1.2.3 From faae54ad4151f120bde60fd6e2b2a40e0de8ac73 Mon Sep 17 00:00:00 2001 From: Chaitanya Tata Date: Thu, 24 Jan 2019 16:13:02 +0530 Subject: cfg80211: reg: remove warn_on for a normal case If there are simulatenous queries of regdb, then there might be a case where multiple queries can trigger request_firmware_no_wait and can have parallel callbacks being executed asynchronously. In this scenario we might hit the WARN_ON. So remove the warn_on, as the code already handles multiple callbacks gracefully. Signed-off-by: Chaitanya Tata Signed-off-by: Johannes Berg --- net/wireless/reg.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index ecfb1a06dbb2..35399a825aed 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1024,8 +1024,13 @@ static void regdb_fw_cb(const struct firmware *fw, void *context) } rtnl_lock(); - if (WARN_ON(regdb && !IS_ERR(regdb))) { - /* just restore and free new db */ + if (regdb && !IS_ERR(regdb)) { + /* negative case - a bug + * positive case - can happen due to race in case of multiple cb's in + * queue, due to usage of asynchronous callback + * + * Either case, just restore and free new db. + */ } else if (set_error) { regdb = ERR_PTR(set_error); } else if (fw) { -- cgit v1.2.3 From 93183bdbe73bbdd03e9566c8dc37c9d06b0d0db6 Mon Sep 17 00:00:00 2001 From: Chaitanya Tata Date: Sat, 19 Jan 2019 03:17:47 +0530 Subject: cfg80211: extend range deviation for DMG Recently, DMG frequency bands have been extended till 71GHz, so extend the range check till 20GHz (45-71GHZ), else some channels will be marked as disabled. Signed-off-by: Chaitanya Tata Signed-off-by: Johannes Berg --- net/wireless/reg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 35399a825aed..dd58b9909ac9 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1260,7 +1260,7 @@ static bool is_valid_rd(const struct ieee80211_regdomain *rd) * definitions (the "2.4 GHz band", the "5 GHz band" and the "60GHz band"), * however it is safe for now to assume that a frequency rule should not be * part of a frequency's band if the start freq or end freq are off by more - * than 2 GHz for the 2.4 and 5 GHz bands, and by more than 10 GHz for the + * than 2 GHz for the 2.4 and 5 GHz bands, and by more than 20 GHz for the * 60 GHz band. * This resolution can be lowered and should be considered as we add * regulatory rule support for other "bands". @@ -1275,7 +1275,7 @@ static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range, * with the Channel starting frequency above 45 GHz. */ u32 limit = freq_khz > 45 * ONE_GHZ_IN_KHZ ? - 10 * ONE_GHZ_IN_KHZ : 2 * ONE_GHZ_IN_KHZ; + 20 * ONE_GHZ_IN_KHZ : 2 * ONE_GHZ_IN_KHZ; if (abs(freq_khz - freq_range->start_freq_khz) <= limit) return true; if (abs(freq_khz - freq_range->end_freq_khz) <= limit) -- cgit v1.2.3 From 63346650c1a94a92be61a57416ac88c0a47c4327 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 24 Jan 2019 14:18:18 -0800 Subject: netrom: switch to sock timer API sk_reset_timer() and sk_stop_timer() properly handle sock refcnt for timer function. Switching to them could fix a refcounting bug reported by syzbot. Reported-and-tested-by: syzbot+defa700d16f1bd1b9a05@syzkaller.appspotmail.com Cc: Ralf Baechle Cc: linux-hams@vger.kernel.org Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/netrom/nr_timer.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index cbd51ed5a2d7..908e53ab47a4 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -52,21 +52,21 @@ void nr_start_t1timer(struct sock *sk) { struct nr_sock *nr = nr_sk(sk); - mod_timer(&nr->t1timer, jiffies + nr->t1); + sk_reset_timer(sk, &nr->t1timer, jiffies + nr->t1); } void nr_start_t2timer(struct sock *sk) { struct nr_sock *nr = nr_sk(sk); - mod_timer(&nr->t2timer, jiffies + nr->t2); + sk_reset_timer(sk, &nr->t2timer, jiffies + nr->t2); } void nr_start_t4timer(struct sock *sk) { struct nr_sock *nr = nr_sk(sk); - mod_timer(&nr->t4timer, jiffies + nr->t4); + sk_reset_timer(sk, &nr->t4timer, jiffies + nr->t4); } void nr_start_idletimer(struct sock *sk) @@ -74,37 +74,37 @@ void nr_start_idletimer(struct sock *sk) struct nr_sock *nr = nr_sk(sk); if (nr->idle > 0) - mod_timer(&nr->idletimer, jiffies + nr->idle); + sk_reset_timer(sk, &nr->idletimer, jiffies + nr->idle); } void nr_start_heartbeat(struct sock *sk) { - mod_timer(&sk->sk_timer, jiffies + 5 * HZ); + sk_reset_timer(sk, &sk->sk_timer, jiffies + 5 * HZ); } void nr_stop_t1timer(struct sock *sk) { - del_timer(&nr_sk(sk)->t1timer); + sk_stop_timer(sk, &nr_sk(sk)->t1timer); } void nr_stop_t2timer(struct sock *sk) { - del_timer(&nr_sk(sk)->t2timer); + sk_stop_timer(sk, &nr_sk(sk)->t2timer); } void nr_stop_t4timer(struct sock *sk) { - del_timer(&nr_sk(sk)->t4timer); + sk_stop_timer(sk, &nr_sk(sk)->t4timer); } void nr_stop_idletimer(struct sock *sk) { - del_timer(&nr_sk(sk)->idletimer); + sk_stop_timer(sk, &nr_sk(sk)->idletimer); } void nr_stop_heartbeat(struct sock *sk) { - del_timer(&sk->sk_timer); + sk_stop_timer(sk, &sk->sk_timer); } int nr_t1timer_running(struct sock *sk) -- cgit v1.2.3 From b0cf029234f9b18e10703ba5147f0389c382bccc Mon Sep 17 00:00:00 2001 From: Bernard Pidoux Date: Fri, 25 Jan 2019 11:46:40 +0100 Subject: net/rose: fix NULL ax25_cb kernel panic When an internally generated frame is handled by rose_xmit(), rose_route_frame() is called: if (!rose_route_frame(skb, NULL)) { dev_kfree_skb(skb); stats->tx_errors++; return NETDEV_TX_OK; } We have the same code sequence in Net/Rom where an internally generated frame is handled by nr_xmit() calling nr_route_frame(skb, NULL). However, in this function NULL argument is tested while it is not in rose_route_frame(). Then kernel panic occurs later on when calling ax25cmp() with a NULL ax25_cb argument as reported many times and recently with syzbot. We need to test if ax25 is NULL before using it. Testing: Built kernel with CONFIG_ROSE=y. Signed-off-by: Bernard Pidoux Acked-by: Dmitry Vyukov Reported-by: syzbot+1a2c456a1ea08fa5b5f7@syzkaller.appspotmail.com Cc: "David S. Miller" Cc: Ralf Baechle Cc: Bernard Pidoux Cc: linux-hams@vger.kernel.org Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- net/rose/rose_route.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 77e9f85a2c92..f2ff21d7df08 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -850,6 +850,7 @@ void rose_link_device_down(struct net_device *dev) /* * Route a frame to an appropriate AX.25 connection. + * A NULL ax25_cb indicates an internally generated frame. */ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) { @@ -867,6 +868,10 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) if (skb->len < ROSE_MIN_LEN) return res; + + if (!ax25) + return rose_loopback_queue(skb, NULL); + frametype = skb->data[2]; lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); if (frametype == ROSE_CALL_REQUEST && -- cgit v1.2.3 From 50c2936634bcb1db78a8ca63249236810c11a80f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 26 Jan 2019 21:12:19 +0100 Subject: decnet: fix DN_IFREQ_SIZE Digging through the ioctls with Al because of the previous patches, we found that on 64-bit decnet's dn_dev_ioctl() is wrong, because struct ifreq::ifr_ifru is actually 24 bytes (not 16 as expected from struct sockaddr) due to the ifru_map and ifru_settings members. Clearly, decnet expects the ioctl to be called with a struct like struct ifreq_dn { char ifr_name[IFNAMSIZ]; struct sockaddr_dn ifr_addr; }; since it does struct ifreq *ifr = ...; struct sockaddr_dn *sdn = (struct sockaddr_dn *)&ifr->ifr_addr; This means that DN_IFREQ_SIZE is too big for what it wants on 64-bit, as it is sizeof(struct ifreq) - sizeof(struct sockaddr) + sizeof(struct sockaddr_dn) This assumes that sizeof(struct sockaddr) is the size of ifr_ifru but that isn't true. Fix this to use offsetof(struct ifreq, ifr_ifru). This indeed doesn't really matter much - the result is that we copy in/out 8 bytes more than we should on 64-bit platforms. In case the "struct ifreq_dn" lands just on the end of a page though it might lead to faults. As far as I can tell, it has been like this forever, so it seems very likely that nobody cares. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/decnet/dn_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index d0b3e69c6b39..0962f9201baa 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -56,7 +56,7 @@ #include #include -#define DN_IFREQ_SIZE (sizeof(struct ifreq) - sizeof(struct sockaddr) + sizeof(struct sockaddr_dn)) +#define DN_IFREQ_SIZE (offsetof(struct ifreq, ifr_ifru) + sizeof(struct sockaddr_dn)) static char dn_rt_all_end_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x04,0x00,0x00}; static char dn_rt_all_rt_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x03,0x00,0x00}; -- cgit v1.2.3 From 146820cc240f4389cf33481c058d9493aef95e25 Mon Sep 17 00:00:00 2001 From: Nir Dotan Date: Sun, 27 Jan 2019 09:26:22 +0200 Subject: ip6mr: Fix notifiers call on mroute_clean_tables() When the MC route socket is closed, mroute_clean_tables() is called to cleanup existing routes. Mistakenly notifiers call was put on the cleanup of the unresolved MC route entries cache. In a case where the MC socket closes before an unresolved route expires, the notifier call leads to a crash, caused by the driver trying to increment a non initialized refcount_t object [1] and then when handling is done, to decrement it [2]. This was detected by a test recently added in commit 6d4efada3b82 ("selftests: forwarding: Add multicast routing test"). Fix that by putting notifiers call on the resolved entries traversal, instead of on the unresolved entries traversal. [1] [ 245.748967] refcount_t: increment on 0; use-after-free. [ 245.754829] WARNING: CPU: 3 PID: 3223 at lib/refcount.c:153 refcount_inc_checked+0x2b/0x30 ... [ 245.802357] Hardware name: Mellanox Technologies Ltd. MSN2740/SA001237, BIOS 5.6.5 06/07/2016 [ 245.811873] RIP: 0010:refcount_inc_checked+0x2b/0x30 ... [ 245.907487] Call Trace: [ 245.910231] mlxsw_sp_router_fib_event.cold.181+0x42/0x47 [mlxsw_spectrum] [ 245.917913] notifier_call_chain+0x45/0x7 [ 245.922484] atomic_notifier_call_chain+0x15/0x20 [ 245.927729] call_fib_notifiers+0x15/0x30 [ 245.932205] mroute_clean_tables+0x372/0x3f [ 245.936971] ip6mr_sk_done+0xb1/0xc0 [ 245.940960] ip6_mroute_setsockopt+0x1da/0x5f0 ... [2] [ 246.128487] refcount_t: underflow; use-after-free. [ 246.133859] WARNING: CPU: 0 PID: 7 at lib/refcount.c:187 refcount_sub_and_test_checked+0x4c/0x60 [ 246.183521] Hardware name: Mellanox Technologies Ltd. MSN2740/SA001237, BIOS 5.6.5 06/07/2016 ... [ 246.193062] Workqueue: mlxsw_core_ordered mlxsw_sp_router_fibmr_event_work [mlxsw_spectrum] [ 246.202394] RIP: 0010:refcount_sub_and_test_checked+0x4c/0x60 ... [ 246.298889] Call Trace: [ 246.301617] refcount_dec_and_test_checked+0x11/0x20 [ 246.307170] mlxsw_sp_router_fibmr_event_work.cold.196+0x47/0x78 [mlxsw_spectrum] [ 246.315531] process_one_work+0x1fa/0x3f0 [ 246.320005] worker_thread+0x2f/0x3e0 [ 246.324083] kthread+0x118/0x130 [ 246.327683] ? wq_update_unbound_numa+0x1b0/0x1b0 [ 246.332926] ? kthread_park+0x80/0x80 [ 246.337013] ret_from_fork+0x1f/0x30 Fixes: 088aa3eec2ce ("ip6mr: Support fib notifications") Signed-off-by: Nir Dotan Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv6/ip6mr.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 30337b38274b..cc01aa3f2b5e 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1516,6 +1516,9 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); list_del_rcu(&c->list); + call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), + FIB_EVENT_ENTRY_DEL, + (struct mfc6_cache *)c, mrt->id); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); mr_cache_put(c); } @@ -1524,10 +1527,6 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); - call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), - FIB_EVENT_ENTRY_DEL, - (struct mfc6_cache *)c, - mrt->id); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); -- cgit v1.2.3 From 2035f3ff8eaa29cfb5c8e2160b0f6e85eeb21a95 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 21 Jan 2019 21:54:36 +0100 Subject: netfilter: ebtables: compat: un-break 32bit setsockopt when no rules are present Unlike ip(6)tables ebtables only counts user-defined chains. The effect is that a 32bit ebtables binary on a 64bit kernel can do 'ebtables -N FOO' only after adding at least one rule, else the request fails with -EINVAL. This is a similar fix as done in 3f1e53abff84 ("netfilter: ebtables: don't attempt to allocate 0-sized compat array"). Fixes: 7d7d7e02111e9 ("netfilter: compat: reject huge allocation requests") Reported-by: Francesco Ruggeri Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 5e55cef0cec3..6693e209efe8 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -2293,9 +2293,12 @@ static int compat_do_replace(struct net *net, void __user *user, xt_compat_lock(NFPROTO_BRIDGE); - ret = xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries); - if (ret < 0) - goto out_unlock; + if (tmp.nentries) { + ret = xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries); + if (ret < 0) + goto out_unlock; + } + ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state); if (ret < 0) goto out_unlock; -- cgit v1.2.3 From 1a6a0951fc009f6d9fe8ebea2d2417d80d54097b Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Mon, 21 Jan 2019 12:53:21 +0100 Subject: netfilter: nfnetlink_osf: add missing fmatch check When we check the tcp options of a packet and it doesn't match the current fingerprint, the tcp packet option pointer must be restored to its initial value in order to do the proper tcp options check for the next fingerprint. Here we can see an example. Assumming the following fingerprint base with two lines: S10:64:1:60:M*,S,T,N,W6: Linux:3.0::Linux 3.0 S20:64:1:60:M*,S,T,N,W7: Linux:4.19:arch:Linux 4.1 Where TCP options are the last field in the OS signature, all of them overlap except by the last one, ie. 'W6' versus 'W7'. In case a packet for Linux 4.19 kicks in, the osf finds no matching because the TCP options pointer is updated after checking for the TCP options in the first line. Therefore, reset pointer back to where it should be. Fixes: 11eeef41d5f6 ("netfilter: passive OS fingerprint xtables match") Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_osf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 6f41dd74729d..1f1d90c1716b 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -66,6 +66,7 @@ static bool nf_osf_match_one(const struct sk_buff *skb, int ttl_check, struct nf_osf_hdr_ctx *ctx) { + const __u8 *optpinit = ctx->optp; unsigned int check_WSS = 0; int fmatch = FMATCH_WRONG; int foptsize, optnum; @@ -155,6 +156,9 @@ static bool nf_osf_match_one(const struct sk_buff *skb, } } + if (fmatch != FMATCH_OK) + ctx->optp = optpinit; + return fmatch == FMATCH_OK; } -- cgit v1.2.3 From 206b8cc514d7ff2b79dd2d5ad939adc7c493f07a Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Wed, 23 Jan 2019 12:48:11 +0100 Subject: netfilter: ipt_CLUSTERIP: fix warning unused variable cn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_PROC_FS isn't set the variable cn isn't used. net/ipv4/netfilter/ipt_CLUSTERIP.c: In function ‘clusterip_net_exit’: net/ipv4/netfilter/ipt_CLUSTERIP.c:849:24: warning: unused variable ‘cn’ [-Wunused-variable] struct clusterip_net *cn = clusterip_pernet(net); ^~ Rework so the variable 'cn' is declared inside "#ifdef CONFIG_PROC_FS". Fixes: b12f7bad5ad3 ("netfilter: ipt_CLUSTERIP: remove wrong WARN_ON_ONCE in netns exit routine") Signed-off-by: Anders Roxell Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index b61977db9b7f..2a909e5f9ba0 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -846,9 +846,9 @@ static int clusterip_net_init(struct net *net) static void clusterip_net_exit(struct net *net) { +#ifdef CONFIG_PROC_FS struct clusterip_net *cn = clusterip_pernet(net); -#ifdef CONFIG_PROC_FS mutex_lock(&cn->mutex); proc_remove(cn->procdir); cn->procdir = NULL; -- cgit v1.2.3 From 1d79895aef18fa05789995d86d523c9b2ee58a02 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 28 Jan 2019 10:13:35 +0100 Subject: sk_msg: Always cancel strp work before freeing the psock Despite having stopped the parser, we still need to deinitialize it by calling strp_done so that it cancels its work. Otherwise the worker thread can run after we have freed the parser, and attempt to access its workqueue resulting in a use-after-free: ================================================================== BUG: KASAN: use-after-free in pwq_activate_delayed_work+0x1b/0x1d0 Read of size 8 at addr ffff888069975240 by task kworker/u2:2/93 CPU: 0 PID: 93 Comm: kworker/u2:2 Not tainted 5.0.0-rc2-00335-g28f9d1a3d4fe-dirty #14 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014 Workqueue: (null) (kstrp) Call Trace: print_address_description+0x6e/0x2b0 ? pwq_activate_delayed_work+0x1b/0x1d0 kasan_report+0xfd/0x177 ? pwq_activate_delayed_work+0x1b/0x1d0 ? pwq_activate_delayed_work+0x1b/0x1d0 pwq_activate_delayed_work+0x1b/0x1d0 ? process_one_work+0x4aa/0x660 pwq_dec_nr_in_flight+0x9b/0x100 worker_thread+0x82/0x680 ? process_one_work+0x660/0x660 kthread+0x1b9/0x1e0 ? __kthread_create_on_node+0x250/0x250 ret_from_fork+0x1f/0x30 Allocated by task 111: sk_psock_init+0x3c/0x1b0 sock_map_link.isra.2+0x103/0x4b0 sock_map_update_common+0x94/0x270 sock_map_update_elem+0x145/0x160 __se_sys_bpf+0x152e/0x1e10 do_syscall_64+0xb2/0x3e0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 112: kfree+0x7f/0x140 process_one_work+0x40b/0x660 worker_thread+0x82/0x680 kthread+0x1b9/0x1e0 ret_from_fork+0x1f/0x30 The buggy address belongs to the object at ffff888069975180 which belongs to the cache kmalloc-512 of size 512 The buggy address is located 192 bytes inside of 512-byte region [ffff888069975180, ffff888069975380) The buggy address belongs to the page: page:ffffea0001a65d00 count:1 mapcount:0 mapping:ffff88806d401280 index:0x0 compound_mapcount: 0 flags: 0x4000000000010200(slab|head) raw: 4000000000010200 dead000000000100 dead000000000200 ffff88806d401280 raw: 0000000000000000 00000000800c000c 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888069975100: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888069975180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff888069975200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888069975280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888069975300: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Reported-by: Marek Majkowski Signed-off-by: Jakub Sitnicki Link: https://lore.kernel.org/netdev/CAJPywTLwgXNEZ2dZVoa=udiZmtrWJ0q5SuBW64aYs0Y1khXX3A@mail.gmail.com Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- net/core/skmsg.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index d6d5c20d7044..8c826603bf36 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -545,8 +545,7 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) struct sk_psock *psock = container_of(gc, struct sk_psock, gc); /* No sk_callback_lock since already detached. */ - if (psock->parser.enabled) - strp_done(&psock->parser.strp); + strp_done(&psock->parser.strp); cancel_work_sync(&psock->work); -- cgit v1.2.3 From 32eb67b93c9e3cd62cb423e30b090cdd4aa8d275 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Sun, 27 Jan 2019 00:57:38 +0000 Subject: net: tls: Save iv in tls_rec for async crypto requests aead_request_set_crypt takes an iv pointer, and we change the iv soon after setting it. Some async crypto algorithms don't save the iv, so we need to save it in the tls_rec for async requests. Found by hardcoding x64 aesni to use async crypto manager (to test the async codepath), however I don't think this combination can happen in the wild. Presumably other hardware offloads will need this fix, but there have been no user reports. Fixes: a42055e8d2c30 ("Add support for async encryption of records...") Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 2 ++ net/tls/tls_sw.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index 2a6ac8d642af..1486b60c4de8 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -120,6 +120,8 @@ struct tls_rec { struct scatterlist sg_aead_out[2]; char aad_space[TLS_AAD_SPACE_SIZE]; + u8 iv_data[TLS_CIPHER_AES_GCM_128_IV_SIZE + + TLS_CIPHER_AES_GCM_128_SALT_SIZE]; struct aead_request aead_req; u8 aead_req_ctx[]; }; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 11cdc8f7db63..7e963560edef 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -439,6 +439,8 @@ static int tls_do_encryption(struct sock *sk, struct scatterlist *sge = sk_msg_elem(msg_en, start); int rc; + memcpy(rec->iv_data, tls_ctx->tx.iv, sizeof(rec->iv_data)); + sge->offset += tls_ctx->tx.prepend_size; sge->length -= tls_ctx->tx.prepend_size; @@ -448,7 +450,7 @@ static int tls_do_encryption(struct sock *sk, aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); aead_request_set_crypt(aead_req, rec->sg_aead_in, rec->sg_aead_out, - data_len, tls_ctx->tx.iv); + data_len, rec->iv_data); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, tls_encrypt_done, sk); -- cgit v1.2.3 From 1023121375c6b0b3dc00334983c762ba2b76cb19 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Sun, 27 Jan 2019 00:59:03 +0000 Subject: net: tls: Fix deadlock in free_resources tx If there are outstanding async tx requests (when crypto returns EINPROGRESS), there is a potential deadlock: the tx work acquires the lock, while we cancel_delayed_work_sync() while holding the lock. Drop the lock while waiting for the work to complete. Fixes: a42055e8d2c30 ("Add support for async encryption of records...") Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 7e963560edef..bf5b54b513bc 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1794,7 +1794,9 @@ void tls_sw_free_resources_tx(struct sock *sk) if (atomic_read(&ctx->encrypt_pending)) crypto_wait_req(-EINPROGRESS, &ctx->async_wait); + release_sock(sk); cancel_delayed_work_sync(&ctx->tx_work.work); + lock_sock(sk); /* Tx whatever records we can transmit and abandon the rest */ tls_tx_records(sk, -1); -- cgit v1.2.3 From 35edfdc77f683c8fd27d7732af06cf6489af60a5 Mon Sep 17 00:00:00 2001 From: Josh Elsasser Date: Sat, 26 Jan 2019 14:38:33 -0800 Subject: net: set default network namespace in init_dummy_netdev() Assign a default net namespace to netdevs created by init_dummy_netdev(). Fixes a NULL pointer dereference caused by busy-polling a socket bound to an iwlwifi wireless device, which bumps the per-net BUSYPOLLRXPACKETS stat if napi_poll() received packets: BUG: unable to handle kernel NULL pointer dereference at 0000000000000190 IP: napi_busy_loop+0xd6/0x200 Call Trace: sock_poll+0x5e/0x80 do_sys_poll+0x324/0x5a0 SyS_poll+0x6c/0xf0 do_syscall_64+0x6b/0x1f0 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Fixes: 7db6b048da3b ("net: Commonize busy polling code to focus on napi_id instead of socket") Signed-off-by: Josh Elsasser Signed-off-by: David S. Miller --- net/core/dev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 82f20022259d..8e276e0192a1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8712,6 +8712,9 @@ int init_dummy_netdev(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); + /* napi_busy_loop stats accounting wants this */ + dev_net_set(dev, &init_net); + /* Note : We dont allocate pcpu_refcnt for dummy devices, * because users of this 'device' dont need to change * its refcount. -- cgit v1.2.3 From 63ff03ab786ab1bc6cca01d48eacd22c95b9b3eb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 25 Jan 2019 22:43:17 +0100 Subject: Revert "socket: fix struct ifreq size in compat ioctl" This reverts commit 1cebf8f143c2 ("socket: fix struct ifreq size in compat ioctl"), it's a bugfix for another commit that I'll revert next. This is not a 'perfect' revert, I'm keeping some coding style intact rather than revert to the state with indentation errors. Cc: stable@vger.kernel.org Fixes: 1cebf8f143c2 ("socket: fix struct ifreq size in compat ioctl") Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/socket.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index e89884e2197b..63b53af7379b 100644 --- a/net/socket.c +++ b/net/socket.c @@ -941,8 +941,7 @@ void dlci_ioctl_set(int (*hook) (unsigned int, void __user *)) EXPORT_SYMBOL(dlci_ioctl_set); static long sock_do_ioctl(struct net *net, struct socket *sock, - unsigned int cmd, unsigned long arg, - unsigned int ifreq_size) + unsigned int cmd, unsigned long arg) { int err; void __user *argp = (void __user *)arg; @@ -968,11 +967,11 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, } else { struct ifreq ifr; bool need_copyout; - if (copy_from_user(&ifr, argp, ifreq_size)) + if (copy_from_user(&ifr, argp, sizeof(struct ifreq))) return -EFAULT; err = dev_ioctl(net, cmd, &ifr, &need_copyout); if (!err && need_copyout) - if (copy_to_user(argp, &ifr, ifreq_size)) + if (copy_to_user(argp, &ifr, sizeof(struct ifreq))) return -EFAULT; } return err; @@ -1071,8 +1070,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) err = open_related_ns(&net->ns, get_net_ns); break; default: - err = sock_do_ioctl(net, sock, cmd, arg, - sizeof(struct ifreq)); + err = sock_do_ioctl(net, sock, cmd, arg); break; } return err; @@ -2780,8 +2778,7 @@ static int do_siocgstamp(struct net *net, struct socket *sock, int err; set_fs(KERNEL_DS); - err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv, - sizeof(struct compat_ifreq)); + err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv); set_fs(old_fs); if (!err) err = compat_put_timeval(&ktv, up); @@ -2797,8 +2794,7 @@ static int do_siocgstampns(struct net *net, struct socket *sock, int err; set_fs(KERNEL_DS); - err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts, - sizeof(struct compat_ifreq)); + err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts); set_fs(old_fs); if (!err) err = compat_put_timespec(&kts, up); @@ -3109,8 +3105,7 @@ static int routing_ioctl(struct net *net, struct socket *sock, } set_fs(KERNEL_DS); - ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r, - sizeof(struct compat_ifreq)); + ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r); set_fs(old_fs); out: @@ -3223,8 +3218,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: case SIOCGIFNAME: - return sock_do_ioctl(net, sock, cmd, arg, - sizeof(struct compat_ifreq)); + return sock_do_ioctl(net, sock, cmd, arg); } return -ENOIOCTLCMD; -- cgit v1.2.3 From 37ac39bdddc528c998a9f36db36937de923fdf2a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 25 Jan 2019 22:43:18 +0100 Subject: Revert "kill dev_ifsioc()" This reverts commit bf4405737f9f ("kill dev_ifsioc()"). This wasn't really unused as implied by the original commit, it still handles the copy to/from user differently, and the commit thus caused issues such as https://bugzilla.kernel.org/show_bug.cgi?id=199469 and https://bugzilla.kernel.org/show_bug.cgi?id=202273 However, deviating from a strict revert, rename dev_ifsioc() to compat_ifreq_ioctl() to be clearer as to its purpose and add a comment. Cc: stable@vger.kernel.org Fixes: bf4405737f9f ("kill dev_ifsioc()") Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/socket.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 63b53af7379b..fbf80f9fb057 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2990,6 +2990,53 @@ static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd, return dev_ioctl(net, cmd, &ifreq, NULL); } +static int compat_ifreq_ioctl(struct net *net, struct socket *sock, + unsigned int cmd, + struct compat_ifreq __user *uifr32) +{ + struct ifreq __user *uifr; + int err; + + /* Handle the fact that while struct ifreq has the same *layout* on + * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data, + * which are handled elsewhere, it still has different *size* due to + * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit, + * resulting in struct ifreq being 32 and 40 bytes respectively). + * As a result, if the struct happens to be at the end of a page and + * the next page isn't readable/writable, we get a fault. To prevent + * that, copy back and forth to the full size. + */ + + uifr = compat_alloc_user_space(sizeof(*uifr)); + if (copy_in_user(uifr, uifr32, sizeof(*uifr32))) + return -EFAULT; + + err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr); + + if (!err) { + switch (cmd) { + case SIOCGIFFLAGS: + case SIOCGIFMETRIC: + case SIOCGIFMTU: + case SIOCGIFMEM: + case SIOCGIFHWADDR: + case SIOCGIFINDEX: + case SIOCGIFADDR: + case SIOCGIFBRDADDR: + case SIOCGIFDSTADDR: + case SIOCGIFNETMASK: + case SIOCGIFPFLAGS: + case SIOCGIFTXQLEN: + case SIOCGMIIPHY: + case SIOCGMIIREG: + if (copy_in_user(uifr32, uifr, sizeof(*uifr32))) + err = -EFAULT; + break; + } + } + return err; +} + static int compat_sioc_ifmap(struct net *net, unsigned int cmd, struct compat_ifreq __user *uifr32) { @@ -3209,6 +3256,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: + return compat_ifreq_ioctl(net, sock, cmd, argp); + case SIOCSARP: case SIOCGARP: case SIOCDARP: -- cgit v1.2.3 From c6c9fee35dc27362b7bac34b2fc9f5b8ace2e22c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 25 Jan 2019 22:43:19 +0100 Subject: net: socket: fix SIOCGIFNAME in compat As reported by Robert O'Callahan in https://bugzilla.kernel.org/show_bug.cgi?id=202273 reverting the previous changes in this area broke the SIOCGIFNAME ioctl in compat again (I'd previously fixed it after his previous report of breakage in https://bugzilla.kernel.org/show_bug.cgi?id=199469). This is obviously because I fixed SIOCGIFNAME more or less by accident. Fix it explicitly now by making it pass through the restored compat translation code. Cc: stable@vger.kernel.org Fixes: 4cf808e7ac32 ("kill dev_ifname32()") Reported-by: Robert O'Callahan Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/socket.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index fbf80f9fb057..473ac8d7c54e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3029,6 +3029,7 @@ static int compat_ifreq_ioctl(struct net *net, struct socket *sock, case SIOCGIFTXQLEN: case SIOCGMIIPHY: case SIOCGMIIREG: + case SIOCGIFNAME: if (copy_in_user(uifr32, uifr, sizeof(*uifr32))) err = -EFAULT; break; @@ -3252,6 +3253,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCSIFTXQLEN: case SIOCBRADDIF: case SIOCBRDELIF: + case SIOCGIFNAME: case SIOCSIFNAME: case SIOCGMIIPHY: case SIOCGMIIREG: @@ -3266,7 +3268,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: - case SIOCGIFNAME: return sock_do_ioctl(net, sock, cmd, arg); } -- cgit v1.2.3 From 98406133dd9cb9f195676eab540c270dceca879a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 25 Jan 2019 22:43:20 +0100 Subject: net: socket: make bond ioctls go through compat_ifreq_ioctl() Same story as before, these use struct ifreq and thus need to be read with the shorter version to not cause faults. Cc: stable@vger.kernel.org Fixes: f92d4fc95341 ("kill bond_ioctl()") Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/socket.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 473ac8d7c54e..d80d87a395ea 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3258,16 +3258,16 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: + case SIOCBONDENSLAVE: + case SIOCBONDRELEASE: + case SIOCBONDSETHWADDR: + case SIOCBONDCHANGEACTIVE: return compat_ifreq_ioctl(net, sock, cmd, argp); case SIOCSARP: case SIOCGARP: case SIOCDARP: case SIOCATMARK: - case SIOCBONDENSLAVE: - case SIOCBONDRELEASE: - case SIOCBONDSETHWADDR: - case SIOCBONDCHANGEACTIVE: return sock_do_ioctl(net, sock, cmd, arg); } -- cgit v1.2.3 From feaf5c796b3f0240f10d0d6d0b686715fd58a05b Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 28 Jan 2019 22:23:48 +0100 Subject: net: ip_gre: always reports o_key to userspace Erspan protocol (version 1 and 2) relies on o_key to configure session id header field. However TUNNEL_KEY bit is cleared in erspan_xmit since ERSPAN protocol does not set the key field of the external GRE header and so the configured o_key is not reported to userspace. The issue can be triggered with the following reproducer: $ip link add erspan1 type erspan local 192.168.0.1 remote 192.168.0.2 \ key 1 seq erspan_ver 1 $ip link set erspan1 up $ip -d link sh erspan1 erspan1@NONE: mtu 1450 qdisc pfifo_fast state UNKNOWN mode DEFAULT link/ether 52:aa:99:95:9a:b5 brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 1500 erspan remote 192.168.0.2 local 192.168.0.1 ttl inherit ikey 0.0.0.1 iseq oseq erspan_index 0 Fix the issue adding TUNNEL_KEY bit to the o_flags parameter in ipgre_fill_info Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 20a64fe6254b..3978f807fa8b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1455,12 +1455,17 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm *p = &t->parms; + __be16 o_flags = p->o_flags; + + if ((t->erspan_ver == 1 || t->erspan_ver == 2) && + !t->collect_md) + o_flags |= TUNNEL_KEY; if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || nla_put_be16(skb, IFLA_GRE_IFLAGS, gre_tnl_flags_to_gre_flags(p->i_flags)) || nla_put_be16(skb, IFLA_GRE_OFLAGS, - gre_tnl_flags_to_gre_flags(p->o_flags)) || + gre_tnl_flags_to_gre_flags(o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) || -- cgit v1.2.3 From c706863bc8902d0c2d1a5a27ac8e1ead5d06b79d Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 28 Jan 2019 22:23:49 +0100 Subject: net: ip6_gre: always reports o_key to userspace As Erspan_v4, Erspan_v6 protocol relies on o_key to configure session id header field. However TUNNEL_KEY bit is cleared in ip6erspan_tunnel_xmit since ERSPAN protocol does not set the key field of the external GRE header and so the configured o_key is not reported to userspace. The issue can be triggered with the following reproducer: $ip link add ip6erspan1 type ip6erspan local 2000::1 remote 2000::2 \ key 1 seq erspan_ver 1 $ip link set ip6erspan1 up ip -d link sh ip6erspan1 ip6erspan1@NONE: mtu 1422 qdisc noop state DOWN mode DEFAULT link/ether ba:ff:09:24:c3:0e brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 1500 ip6erspan remote 2000::2 local 2000::1 encaplimit 4 flowlabel 0x00000 ikey 0.0.0.1 iseq oseq Fix the issue adding TUNNEL_KEY bit to the o_flags parameter in ip6gre_fill_info Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support") Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4416368dbd49..801a9a0c217e 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -2098,12 +2098,17 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm *p = &t->parms; + __be16 o_flags = p->o_flags; + + if ((p->erspan_ver == 1 || p->erspan_ver == 2) && + !p->collect_md) + o_flags |= TUNNEL_KEY; if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || nla_put_be16(skb, IFLA_GRE_IFLAGS, gre_tnl_flags_to_gre_flags(p->i_flags)) || nla_put_be16(skb, IFLA_GRE_OFLAGS, - gre_tnl_flags_to_gre_flags(p->o_flags)) || + gre_tnl_flags_to_gre_flags(o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || nla_put_in6_addr(skb, IFLA_GRE_LOCAL, &p->laddr) || -- cgit v1.2.3 From ef489749aae508e6f17886775c075f12ff919fb1 Mon Sep 17 00:00:00 2001 From: Yohei Kanemaru Date: Tue, 29 Jan 2019 15:52:34 +0900 Subject: ipv6: sr: clear IP6CB(skb) on SRH ip4ip6 encapsulation skb->cb may contain data from previous layers (in an observed case IPv4 with L3 Master Device). In the observed scenario, the data in IPCB(skb)->frags was misinterpreted as IP6CB(skb)->frag_max_size, eventually caused an unexpected IPv6 fragmentation in ip6_fragment() through ip6_finish_output(). This patch clears IP6CB(skb), which potentially contains garbage data, on the SRH ip4ip6 encapsulation. Fixes: 32d99d0b6702 ("ipv6: sr: add support for ip4ip6 encapsulation") Signed-off-by: Yohei Kanemaru Signed-off-by: David S. Miller --- net/ipv6/seg6_iptunnel.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 8181ee7e1e27..ee5403cbe655 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -146,6 +146,8 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) } else { ip6_flow_hdr(hdr, 0, flowlabel); hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb)); + + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); } hdr->nexthdr = NEXTHDR_ROUTING; -- cgit v1.2.3 From 4522a70db7aa5e77526a4079628578599821b193 Mon Sep 17 00:00:00 2001 From: Jacob Wen Date: Wed, 30 Jan 2019 14:55:14 +0800 Subject: l2tp: fix reading optional fields of L2TPv3 Use pskb_may_pull() to make sure the optional fields are in skb linear parts, so we can safely read them later. It's easy to reproduce the issue with a net driver that supports paged skb data. Just create a L2TPv3 over IP tunnel and then generates some network traffic. Once reproduced, rx err in /sys/kernel/debug/l2tp/tunnels will increase. Changes in v4: 1. s/l2tp_v3_pull_opt/l2tp_v3_ensure_opt_in_linear/ 2. s/tunnel->version != L2TP_HDR_VER_2/tunnel->version == L2TP_HDR_VER_3/ 3. Add 'Fixes' in commit messages. Changes in v3: 1. To keep consistency, move the code out of l2tp_recv_common. 2. Use "net" instead of "net-next", since this is a bug fix. Changes in v2: 1. Only fix L2TPv3 to make code simple. To fix both L2TPv3 and L2TPv2, we'd better refactor l2tp_recv_common. It's complicated to do so. 2. Reloading pointers after pskb_may_pull Fixes: f7faffa3ff8e ("l2tp: Add L2TPv3 protocol support") Fixes: 0d76751fad77 ("l2tp: Add L2TPv3 IP encapsulation (no UDP) support") Fixes: a32e0eec7042 ("l2tp: introduce L2TPv3 IP encapsulation support for IPv6") Signed-off-by: Jacob Wen Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 4 ++++ net/l2tp/l2tp_core.h | 20 ++++++++++++++++++++ net/l2tp/l2tp_ip.c | 3 +++ net/l2tp/l2tp_ip6.c | 3 +++ 4 files changed, 30 insertions(+) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 26f1d435696a..dd5ba0c11ab3 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -884,6 +884,10 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) goto error; } + if (tunnel->version == L2TP_HDR_VER_3 && + l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) + goto error; + l2tp_recv_common(session, skb, ptr, optr, hdrflags, length); l2tp_session_dec_refcount(session); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 9c9afe94d389..b2ce90260c35 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -301,6 +301,26 @@ static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel) } #endif +static inline int l2tp_v3_ensure_opt_in_linear(struct l2tp_session *session, struct sk_buff *skb, + unsigned char **ptr, unsigned char **optr) +{ + int opt_len = session->peer_cookie_len + l2tp_get_l2specific_len(session); + + if (opt_len > 0) { + int off = *ptr - *optr; + + if (!pskb_may_pull(skb, off + opt_len)) + return -1; + + if (skb->data != *optr) { + *optr = skb->data; + *ptr = skb->data + off; + } + } + + return 0; +} + #define l2tp_printk(ptr, type, func, fmt, ...) \ do { \ if (((ptr)->debug) & (type)) \ diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 35f6f86d4dcc..d4c60523c549 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -165,6 +165,9 @@ static int l2tp_ip_recv(struct sk_buff *skb) print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length); } + if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) + goto discard_sess; + l2tp_recv_common(session, skb, ptr, optr, 0, skb->len); l2tp_session_dec_refcount(session); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 237f1a4a0b0c..0ae6899edac0 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -178,6 +178,9 @@ static int l2tp_ip6_recv(struct sk_buff *skb) print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length); } + if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) + goto discard_sess; + l2tp_recv_common(session, skb, ptr, optr, 0, skb->len); l2tp_session_dec_refcount(session); -- cgit v1.2.3 From 91c524708de6207f59dd3512518d8a1c7b434ee3 Mon Sep 17 00:00:00 2001 From: Jacob Wen Date: Thu, 31 Jan 2019 15:18:56 +0800 Subject: l2tp: copy 4 more bytes to linear part if necessary The size of L2TPv2 header with all optional fields is 14 bytes. l2tp_udp_recv_core only moves 10 bytes to the linear part of a skb. This may lead to l2tp_recv_common read data outside of a skb. This patch make sure that there is at least 14 bytes in the linear part of a skb to meet the maximum need of l2tp_udp_recv_core and l2tp_recv_common. The minimum size of both PPP HDLC-like frame and Ethernet frame is larger than 14 bytes, so we are safe to do so. Also remove L2TP_HDR_SIZE_NOSEQ, it is unused now. Fixes: fd558d186df2 ("l2tp: Split pppol2tp patch into separate l2tp and ppp parts") Suggested-by: Guillaume Nault Signed-off-by: Jacob Wen Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index dd5ba0c11ab3..fed6becc5daf 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -83,8 +83,7 @@ #define L2TP_SLFLAG_S 0x40000000 #define L2TP_SL_SEQ_MASK 0x00ffffff -#define L2TP_HDR_SIZE_SEQ 10 -#define L2TP_HDR_SIZE_NOSEQ 6 +#define L2TP_HDR_SIZE_MAX 14 /* Default trace flags */ #define L2TP_DEFAULT_DEBUG_FLAGS 0 @@ -808,7 +807,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) __skb_pull(skb, sizeof(struct udphdr)); /* Short packet? */ - if (!pskb_may_pull(skb, L2TP_HDR_SIZE_SEQ)) { + if (!pskb_may_pull(skb, L2TP_HDR_SIZE_MAX)) { l2tp_info(tunnel, L2TP_MSG_DATA, "%s: recv short packet (len=%d)\n", tunnel->name, skb->len); -- cgit v1.2.3 From 6fa19f5637a6c22bc0999596bcc83bdcac8a4fa6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 31 Jan 2019 08:47:10 -0800 Subject: rds: fix refcount bug in rds_sock_addref syzbot was able to catch a bug in rds [1] The issue here is that the socket might be found in a hash table but that its refcount has already be set to 0 by another cpu. We need to use refcount_inc_not_zero() to be safe here. [1] refcount_t: increment on 0; use-after-free. WARNING: CPU: 1 PID: 23129 at lib/refcount.c:153 refcount_inc_checked lib/refcount.c:153 [inline] WARNING: CPU: 1 PID: 23129 at lib/refcount.c:153 refcount_inc_checked+0x61/0x70 lib/refcount.c:151 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 23129 Comm: syz-executor3 Not tainted 5.0.0-rc4+ #53 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1db/0x2d0 lib/dump_stack.c:113 panic+0x2cb/0x65c kernel/panic.c:214 __warn.cold+0x20/0x48 kernel/panic.c:571 report_bug+0x263/0x2b0 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:178 [inline] fixup_bug arch/x86/kernel/traps.c:173 [inline] do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:271 do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:290 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:973 RIP: 0010:refcount_inc_checked lib/refcount.c:153 [inline] RIP: 0010:refcount_inc_checked+0x61/0x70 lib/refcount.c:151 Code: 1d 51 63 c8 06 31 ff 89 de e8 eb 1b f2 fd 84 db 75 dd e8 a2 1a f2 fd 48 c7 c7 60 9f 81 88 c6 05 31 63 c8 06 01 e8 af 65 bb fd <0f> 0b eb c1 90 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 41 54 49 RSP: 0018:ffff8880a0cbf1e8 EFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffc90006113000 RDX: 000000000001047d RSI: ffffffff81685776 RDI: 0000000000000005 RBP: ffff8880a0cbf1f8 R08: ffff888097c9e100 R09: ffffed1015ce5021 R10: ffffed1015ce5020 R11: ffff8880ae728107 R12: ffff8880723c20c0 R13: ffff8880723c24b0 R14: dffffc0000000000 R15: ffffed1014197e64 sock_hold include/net/sock.h:647 [inline] rds_sock_addref+0x19/0x20 net/rds/af_rds.c:675 rds_find_bound+0x97c/0x1080 net/rds/bind.c:82 rds_recv_incoming+0x3be/0x1430 net/rds/recv.c:362 rds_loop_xmit+0xf3/0x2a0 net/rds/loop.c:96 rds_send_xmit+0x1355/0x2a10 net/rds/send.c:355 rds_sendmsg+0x323c/0x44e0 net/rds/send.c:1368 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg+0xdd/0x130 net/socket.c:631 __sys_sendto+0x387/0x5f0 net/socket.c:1788 __do_sys_sendto net/socket.c:1800 [inline] __se_sys_sendto net/socket.c:1796 [inline] __x64_sys_sendto+0xe1/0x1a0 net/socket.c:1796 do_syscall_64+0x1a3/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x458089 Code: 6d b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 3b b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fc266df8c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 0000000000458089 RDX: 0000000000000000 RSI: 00000000204b3fff RDI: 0000000000000005 RBP: 000000000073bf00 R08: 00000000202b4000 R09: 0000000000000010 R10: 0000000000000000 R11: 0000000000000246 R12: 00007fc266df96d4 R13: 00000000004c56e4 R14: 00000000004d94a8 R15: 00000000ffffffff Fixes: cc4dfb7f70a3 ("rds: fix two RCU related problems") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Sowmini Varadhan Cc: Santosh Shilimkar Cc: rds-devel@oss.oracle.com Cc: Cong Wang Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/bind.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/bind.c b/net/rds/bind.c index 762d2c6788a3..17c9d9f0c848 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -78,10 +78,10 @@ struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, __rds_create_bind_key(key, addr, port, scope_id); rcu_read_lock(); rs = rhashtable_lookup(&bind_hash_table, key, ht_parms); - if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) - rds_sock_addref(rs); - else + if (rs && (sock_flag(rds_rs_to_sk(rs), SOCK_DEAD) || + !refcount_inc_not_zero(&rds_rs_to_sk(rs)->sk_refcnt))) rs = NULL; + rcu_read_unlock(); rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr, -- cgit v1.2.3 From 9d0f50b80222dc273e67e4e14410fcfa4130a90c Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 29 Jan 2019 11:10:57 +0100 Subject: mac80211: ensure that mgmt tx skbs have tailroom for encryption Some drivers use IEEE80211_KEY_FLAG_SW_MGMT_TX to indicate that management frames need to be software encrypted. Since normal data packets are still encrypted by the hardware, crypto_tx_tailroom_needed_cnt gets decremented after key upload to hw. This can lead to passing skbs to ccmp_encrypt_skb, which don't have the necessary tailroom for software encryption. Change the code to add tailroom for encrypted management packets, even if crypto_tx_tailroom_needed_cnt is 0. Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index f170d6c6629a..928f13a208b0 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1938,9 +1938,16 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, int head_need, bool may_encrypt) { struct ieee80211_local *local = sdata->local; + struct ieee80211_hdr *hdr; + bool enc_tailroom; int tail_need = 0; - if (may_encrypt && sdata->crypto_tx_tailroom_needed_cnt) { + hdr = (struct ieee80211_hdr *) skb->data; + enc_tailroom = may_encrypt && + (sdata->crypto_tx_tailroom_needed_cnt || + ieee80211_is_mgmt(hdr->frame_control)); + + if (enc_tailroom) { tail_need = IEEE80211_ENCRYPT_TAILROOM; tail_need -= skb_tailroom(skb); tail_need = max_t(int, tail_need, 0); @@ -1948,8 +1955,7 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, if (skb_cloned(skb) && (!ieee80211_hw_check(&local->hw, SUPPORTS_CLONED_SKBS) || - !skb_clone_writable(skb, ETH_HLEN) || - (may_encrypt && sdata->crypto_tx_tailroom_needed_cnt))) + !skb_clone_writable(skb, ETH_HLEN) || enc_tailroom)) I802_DEBUG_INC(local->tx_expand_skb_head_cloned); else if (head_need || tail_need) I802_DEBUG_INC(local->tx_expand_skb_head); -- cgit v1.2.3 From e005bd7ddea06784c1eb91ac5bb6b171a94f3b05 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 1 Feb 2019 11:09:54 +0100 Subject: cfg80211: call disconnect_wk when AP stops Since we now prevent regulatory restore during STA disconnect if concurrent AP interfaces are active, we need to reschedule this check when the AP state changes. This fixes never doing a restore when an AP is the last interface to stop. Or to put it another way: we need to re-check after anything we check here changes. Cc: stable@vger.kernel.org Fixes: 113f3aaa81bd ("cfg80211: Prevent regulatory restore during STA disconnect in concurrent interfaces") Signed-off-by: Johannes Berg --- net/wireless/ap.c | 2 ++ net/wireless/core.h | 2 ++ net/wireless/sme.c | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/ap.c b/net/wireless/ap.c index 882d97bdc6bf..550ac9d827fe 100644 --- a/net/wireless/ap.c +++ b/net/wireless/ap.c @@ -41,6 +41,8 @@ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, cfg80211_sched_dfs_chan_update(rdev); } + schedule_work(&cfg80211_disconnect_work); + return err; } diff --git a/net/wireless/core.h b/net/wireless/core.h index c5d6f3418601..f6b40563dc63 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -445,6 +445,8 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev); bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, u32 center_freq_khz, u32 bw_khz); +extern struct work_struct cfg80211_disconnect_work; + /** * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable * @wiphy: the wiphy to validate against diff --git a/net/wireless/sme.c b/net/wireless/sme.c index f741d8376a46..7d34cb884840 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -667,7 +667,7 @@ static void disconnect_work(struct work_struct *work) rtnl_unlock(); } -static DECLARE_WORK(cfg80211_disconnect_work, disconnect_work); +DECLARE_WORK(cfg80211_disconnect_work, disconnect_work); /* -- cgit v1.2.3 From ba59fb0273076637f0add4311faa990a5eec27c0 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 1 Feb 2019 15:15:22 +0100 Subject: sctp: walk the list of asoc safely In sctp_sendmesg(), when walking the list of endpoint associations, the association can be dropped from the list, making the list corrupt. Properly handle this by using list_for_each_entry_safe() Fixes: 4910280503f3 ("sctp: add support for snd flag SCTP_SENDALL process in sendmsg") Reported-by: Secunia Research Tested-by: Secunia Research Signed-off-by: Greg Kroah-Hartman Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index f93c3cf9e567..65d6d04546ae 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2027,7 +2027,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_transport *transport = NULL; struct sctp_sndrcvinfo _sinfo, *sinfo; - struct sctp_association *asoc; + struct sctp_association *asoc, *tmp; struct sctp_cmsgs cmsgs; union sctp_addr *daddr; bool new = false; @@ -2053,7 +2053,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) /* SCTP_SENDALL process */ if ((sflags & SCTP_SENDALL) && sctp_style(sk, UDP)) { - list_for_each_entry(asoc, &ep->asocs, asocs) { + list_for_each_entry_safe(asoc, tmp, &ep->asocs, asocs) { err = sctp_sendmsg_check_sflags(asoc, sflags, msg, msg_len); if (err == 0) -- cgit v1.2.3 From 14d22d4d61e40623a7c5816728bfe55c322e779a Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 30 Jan 2019 18:51:00 +0100 Subject: net/smc: fix another sizeof to int comparison Comparing an int to a size, which is unsigned, causes the int to become unsigned, giving the wrong result. kernel_sendmsg can return a negative error code. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_clc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 776e9dfc915d..d53fd588d1f5 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -378,7 +378,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) vec.iov_len = sizeof(struct smc_clc_msg_decline); len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(struct smc_clc_msg_decline)); - if (len < sizeof(struct smc_clc_msg_decline)) + if (len < 0 || len < sizeof(struct smc_clc_msg_decline)) len = -EPROTO; return len > 0 ? 0 : len; } -- cgit v1.2.3 From ca8dc1334a71d6081b09e18d55198a27e28fd44b Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Wed, 30 Jan 2019 18:51:01 +0100 Subject: net/smc: allow 16 byte pnetids in netlink policy Currently, users can only send pnetids with a maximum length of 15 bytes over the SMC netlink interface although the maximum pnetid length is 16 bytes. This patch changes the SMC netlink policy to accept 16 byte pnetids. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_pnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 7cb3e4f07c10..632c3109dee5 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -27,7 +27,7 @@ static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { .type = NLA_NUL_STRING, - .len = SMC_MAX_PNETID_LEN - 1 + .len = SMC_MAX_PNETID_LEN }, [SMC_PNETID_ETHNAME] = { .type = NLA_NUL_STRING, -- cgit v1.2.3 From 77f838ace755d2f466536c44dac6c856f62cd901 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 30 Jan 2019 18:51:02 +0100 Subject: net/smc: prevent races between smc_lgr_terminate() and smc_conn_free() To prevent races between smc_lgr_terminate() and smc_conn_free() add an extra check of the lgr field before accessing it, and cancel a delayed free_work when a new smc connection is created. This fixes the problem that free_work cleared the lgr variable but smc_lgr_terminate() or smc_conn_free() still access it in parallel. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 35c1cdc93e1c..097c798983ca 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -128,6 +128,8 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!lgr) + return; write_lock_bh(&lgr->conns_lock); if (conn->alert_token_local) { __smc_lgr_unregister_conn(conn); @@ -628,6 +630,8 @@ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, local_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; smc_lgr_register_conn(conn); /* add smc conn to lgr */ + if (delayed_work_pending(&lgr->free_work)) + cancel_delayed_work(&lgr->free_work); write_unlock_bh(&lgr->conns_lock); break; } -- cgit v1.2.3 From 6889b36da78a21a312d8b462c1fa25a03c2ff192 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 30 Jan 2019 18:51:03 +0100 Subject: net/smc: don't wait for send buffer space when data was already sent When there is no more send buffer space and at least 1 byte was already sent then return to user space. The wait is only done when no data was sent by the sendmsg() call. This fixes smc_tx_sendmsg() which tried to always send all user data and started to wait for free send buffer space when needed. During this wait the user space program was blocked in the sendmsg() call and hence not able to receive incoming data. When both sides were in such a situation then the connection stalled forever. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_tx.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index d8366ed51757..f99951f3f7fd 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -165,12 +165,11 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) conn->local_tx_ctrl.prod_flags.urg_data_pending = 1; if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) { + if (send_done) + return send_done; rc = smc_tx_wait(smc, msg->msg_flags); - if (rc) { - if (send_done) - return send_done; + if (rc) goto out_err; - } continue; } -- cgit v1.2.3 From 51c5aba3b672c4285fca052817f34b22dc79dda7 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 30 Jan 2019 18:51:04 +0100 Subject: net/smc: recvmsg and splice_read should return 0 after shutdown When a socket was connected and is now shut down for read, return 0 to indicate end of data in recvmsg and splice_read (like TCP) and do not return ENOTCONN. This behavior is required by the socket api. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c4e56602e0c6..b04a813fc865 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1505,6 +1505,11 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, smc = smc_sk(sk); lock_sock(sk); + if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) { + /* socket was connected before, no more data to read */ + rc = 0; + goto out; + } if ((sk->sk_state == SMC_INIT) || (sk->sk_state == SMC_LISTEN) || (sk->sk_state == SMC_CLOSED)) @@ -1840,7 +1845,11 @@ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, smc = smc_sk(sk); lock_sock(sk); - + if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) { + /* socket was connected before, no more data to read */ + rc = 0; + goto out; + } if (sk->sk_state == SMC_INIT || sk->sk_state == SMC_LISTEN || sk->sk_state == SMC_CLOSED) -- cgit v1.2.3 From 33f3fcc290671590821ff3c0c9396db1ec9b7d4c Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 30 Jan 2019 18:51:05 +0100 Subject: net/smc: do not wait under send_lock smc_cdc_get_free_slot() might wait for free transfer buffers when using SMC-R. This wait should not be done under the send_lock, which is a spin_lock. This fixes a cpu loop in parallel threads waiting for the send_lock. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_tx.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index f99951f3f7fd..36af3de731b9 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -488,25 +488,23 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) struct smc_wr_buf *wr_buf; int rc; - spin_lock_bh(&conn->send_lock); rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend); if (rc < 0) { if (rc == -EBUSY) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); - if (smc->sk.sk_err == ECONNABORTED) { - rc = sock_error(&smc->sk); - goto out_unlock; - } + if (smc->sk.sk_err == ECONNABORTED) + return sock_error(&smc->sk); rc = 0; if (conn->alert_token_local) /* connection healthy */ mod_delayed_work(system_wq, &conn->tx_work, SMC_TX_WORK_DELAY); } - goto out_unlock; + return rc; } + spin_lock_bh(&conn->send_lock); if (!conn->local_tx_ctrl.prod_flags.urg_data_present) { rc = smc_tx_rdma_writes(conn); if (rc) { -- cgit v1.2.3 From 2dee25af42f99b476d5916aeac5c994e4dcc910b Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 30 Jan 2019 18:51:06 +0100 Subject: net/smc: call smc_cdc_msg_send() under send_lock Call smc_cdc_msg_send() under the connection send_lock to make sure all send operations for one connection are serialized. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_cdc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index db83332ac1c8..1c5333d494e9 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -125,7 +125,10 @@ static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) if (rc) return rc; - return smc_cdc_msg_send(conn, wr_buf, pend); + spin_lock_bh(&conn->send_lock); + rc = smc_cdc_msg_send(conn, wr_buf, pend); + spin_unlock_bh(&conn->send_lock); + return rc; } int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) -- cgit v1.2.3 From e5f3aa04dbfd92154f21edfeb7a1676a45918928 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 30 Jan 2019 18:51:07 +0100 Subject: net/smc: use device link provided in qp_context The device field of the IB event structure does not always point to the SMC IB device. Load the pointer from the qp_context which is always provided to smc_ib_qp_event_handler() in the priv field. And for qp events the affected port is given in the qp structure of the ibevent, derive it from there. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_ib.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index e519ef29c0ff..76487a16934e 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -289,8 +289,8 @@ int smc_ib_create_protection_domain(struct smc_link *lnk) static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) { - struct smc_ib_device *smcibdev = - (struct smc_ib_device *)ibevent->device; + struct smc_link *lnk = (struct smc_link *)priv; + struct smc_ib_device *smcibdev = lnk->smcibdev; u8 port_idx; switch (ibevent->event) { @@ -298,7 +298,7 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) case IB_EVENT_GID_CHANGE: case IB_EVENT_PORT_ERR: case IB_EVENT_QP_ACCESS_ERR: - port_idx = ibevent->element.port_num - 1; + port_idx = ibevent->element.qp->port - 1; set_bit(port_idx, &smcibdev->port_event_mask); schedule_work(&smcibdev->port_event_work); break; -- cgit v1.2.3 From 46ad02229d6b4ee276eb295ca08f039993f323c8 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 30 Jan 2019 18:51:08 +0100 Subject: net/smc: fix use of variable in cleared area Do not use pend->idx as index for the arrays because its value is located in the cleared area. Use the existing local variable instead. Without this fix the wrong area might be cleared. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_wr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index c2694750a6a8..1dc88c32d6bb 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -218,10 +218,10 @@ int smc_wr_tx_put_slot(struct smc_link *link, u32 idx = pend->idx; /* clear the full struct smc_wr_tx_pend including .priv */ - memset(&link->wr_tx_pends[pend->idx], 0, - sizeof(link->wr_tx_pends[pend->idx])); - memset(&link->wr_tx_bufs[pend->idx], 0, - sizeof(link->wr_tx_bufs[pend->idx])); + memset(&link->wr_tx_pends[idx], 0, + sizeof(link->wr_tx_pends[idx])); + memset(&link->wr_tx_bufs[idx], 0, + sizeof(link->wr_tx_bufs[idx])); test_and_clear_bit(idx, link->wr_tx_mask); return 1; } -- cgit v1.2.3 From 9b1f19d810e92d6cdc68455fbc22d9f961a58ce1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Jan 2019 11:39:41 -0800 Subject: dccp: fool proof ccid_hc_[rt]x_parse_options() Similarly to commit 276bdb82dedb ("dccp: check ccid before dereferencing") it is wise to test for a NULL ccid. kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 16 Comm: ksoftirqd/1 Not tainted 5.0.0-rc3+ #37 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:ccid_hc_tx_parse_options net/dccp/ccid.h:205 [inline] RIP: 0010:dccp_parse_options+0x8d9/0x12b0 net/dccp/options.c:233 Code: c5 0f b6 75 b3 80 38 00 0f 85 d6 08 00 00 48 b9 00 00 00 00 00 fc ff df 48 8b 45 b8 4c 8b b8 f8 07 00 00 4c 89 f8 48 c1 e8 03 <80> 3c 08 00 0f 85 95 08 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8b kobject: 'loop5' (0000000080f78fc1): kobject_uevent_env RSP: 0018:ffff8880a94df0b8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff8880858ac723 RCX: dffffc0000000000 RDX: 0000000000000100 RSI: 0000000000000007 RDI: 0000000000000001 RBP: ffff8880a94df140 R08: 0000000000000001 R09: ffff888061b83a80 R10: ffffed100c370752 R11: ffff888061b83a97 R12: 0000000000000026 R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8880ae700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f0defa33518 CR3: 000000008db5e000 CR4: 00000000001406e0 kobject: 'loop5' (0000000080f78fc1): fill_kobj_path: path = '/devices/virtual/block/loop5' DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: dccp_rcv_state_process+0x2b6/0x1af6 net/dccp/input.c:654 dccp_v4_do_rcv+0x100/0x190 net/dccp/ipv4.c:688 sk_backlog_rcv include/net/sock.h:936 [inline] __sk_receive_skb+0x3a9/0xea0 net/core/sock.c:473 dccp_v4_rcv+0x10cb/0x1f80 net/dccp/ipv4.c:880 ip_protocol_deliver_rcu+0xb6/0xa20 net/ipv4/ip_input.c:208 ip_local_deliver_finish+0x23b/0x390 net/ipv4/ip_input.c:234 NF_HOOK include/linux/netfilter.h:289 [inline] NF_HOOK include/linux/netfilter.h:283 [inline] ip_local_deliver+0x1f0/0x740 net/ipv4/ip_input.c:255 dst_input include/net/dst.h:450 [inline] ip_rcv_finish+0x1f4/0x2f0 net/ipv4/ip_input.c:414 NF_HOOK include/linux/netfilter.h:289 [inline] NF_HOOK include/linux/netfilter.h:283 [inline] ip_rcv+0xed/0x620 net/ipv4/ip_input.c:524 __netif_receive_skb_one_core+0x160/0x210 net/core/dev.c:4973 __netif_receive_skb+0x2c/0x1c0 net/core/dev.c:5083 process_backlog+0x206/0x750 net/core/dev.c:5923 napi_poll net/core/dev.c:6346 [inline] net_rx_action+0x76d/0x1930 net/core/dev.c:6412 __do_softirq+0x30b/0xb11 kernel/softirq.c:292 run_ksoftirqd kernel/softirq.c:654 [inline] run_ksoftirqd+0x8e/0x110 kernel/softirq.c:646 smpboot_thread_fn+0x6ab/0xa10 kernel/smpboot.c:164 kthread+0x357/0x430 kernel/kthread.c:246 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352 Modules linked in: ---[ end trace 58a0ba03bea2c376 ]--- RIP: 0010:ccid_hc_tx_parse_options net/dccp/ccid.h:205 [inline] RIP: 0010:dccp_parse_options+0x8d9/0x12b0 net/dccp/options.c:233 Code: c5 0f b6 75 b3 80 38 00 0f 85 d6 08 00 00 48 b9 00 00 00 00 00 fc ff df 48 8b 45 b8 4c 8b b8 f8 07 00 00 4c 89 f8 48 c1 e8 03 <80> 3c 08 00 0f 85 95 08 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8b RSP: 0018:ffff8880a94df0b8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff8880858ac723 RCX: dffffc0000000000 RDX: 0000000000000100 RSI: 0000000000000007 RDI: 0000000000000001 RBP: ffff8880a94df140 R08: 0000000000000001 R09: ffff888061b83a80 R10: ffffed100c370752 R11: ffff888061b83a97 R12: 0000000000000026 R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8880ae700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f0defa33518 CR3: 0000000009871000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccid.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 6eb837a47b5c..baaaeb2b2c42 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -202,7 +202,7 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, u8 pkt, u8 opt, u8 *val, u8 len) { - if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL) + if (!ccid || !ccid->ccid_ops->ccid_hc_tx_parse_options) return 0; return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); } @@ -214,7 +214,7 @@ static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, u8 pkt, u8 opt, u8 *val, u8 len) { - if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL) + if (!ccid || !ccid->ccid_ops->ccid_hc_rx_parse_options) return 0; return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); } -- cgit v1.2.3 From 22b5c0b63f32568e130fa2df4ba23efce3eb495b Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 1 Feb 2019 12:42:06 +0100 Subject: vsock/virtio: fix kernel panic after device hot-unplug virtio_vsock_remove() invokes the vsock_core_exit() also if there are opened sockets for the AF_VSOCK protocol family. In this way the vsock "transport" pointer is set to NULL, triggering the kernel panic at the first socket activity. This patch move the vsock_core_init()/vsock_core_exit() in the virtio_vsock respectively in module_init and module_exit functions, that cannot be invoked until there are open sockets. Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1609699 Reported-by: Yan Fu Signed-off-by: Stefano Garzarella Acked-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 5d3cce9e8744..9dae54698737 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -75,6 +75,9 @@ static u32 virtio_transport_get_local_cid(void) { struct virtio_vsock *vsock = virtio_vsock_get(); + if (!vsock) + return VMADDR_CID_ANY; + return vsock->guest_cid; } @@ -584,10 +587,6 @@ static int virtio_vsock_probe(struct virtio_device *vdev) virtio_vsock_update_guest_cid(vsock); - ret = vsock_core_init(&virtio_transport.transport); - if (ret < 0) - goto out_vqs; - vsock->rx_buf_nr = 0; vsock->rx_buf_max_nr = 0; atomic_set(&vsock->queued_replies, 0); @@ -618,8 +617,6 @@ static int virtio_vsock_probe(struct virtio_device *vdev) mutex_unlock(&the_virtio_vsock_mutex); return 0; -out_vqs: - vsock->vdev->config->del_vqs(vsock->vdev); out: kfree(vsock); mutex_unlock(&the_virtio_vsock_mutex); @@ -669,7 +666,6 @@ static void virtio_vsock_remove(struct virtio_device *vdev) mutex_lock(&the_virtio_vsock_mutex); the_virtio_vsock = NULL; - vsock_core_exit(); mutex_unlock(&the_virtio_vsock_mutex); vdev->config->del_vqs(vdev); @@ -702,14 +698,28 @@ static int __init virtio_vsock_init(void) virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", 0, 0); if (!virtio_vsock_workqueue) return -ENOMEM; + ret = register_virtio_driver(&virtio_vsock_driver); if (ret) - destroy_workqueue(virtio_vsock_workqueue); + goto out_wq; + + ret = vsock_core_init(&virtio_transport.transport); + if (ret) + goto out_vdr; + + return 0; + +out_vdr: + unregister_virtio_driver(&virtio_vsock_driver); +out_wq: + destroy_workqueue(virtio_vsock_workqueue); return ret; + } static void __exit virtio_vsock_exit(void) { + vsock_core_exit(); unregister_virtio_driver(&virtio_vsock_driver); destroy_workqueue(virtio_vsock_workqueue); } -- cgit v1.2.3 From 85965487abc540368393a15491e6e7fcd230039d Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 1 Feb 2019 12:42:07 +0100 Subject: vsock/virtio: reset connected sockets on device removal When the virtio transport device disappear, we should reset all connected sockets in order to inform the users. Signed-off-by: Stefano Garzarella Reviewed-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 9dae54698737..15eb5d3d4750 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -634,6 +634,9 @@ static void virtio_vsock_remove(struct virtio_device *vdev) flush_work(&vsock->event_work); flush_work(&vsock->send_pkt_work); + /* Reset all connected sockets when the device disappear */ + vsock_for_each_connected_socket(virtio_vsock_reset_sock); + vdev->config->reset(vdev); mutex_lock(&vsock->rx_lock); -- cgit v1.2.3 From cfe4bd7a257f6d6f81d3458d8c9d9ec4957539e6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 4 Feb 2019 03:27:58 +0800 Subject: sctp: check and update stream->out_curr when allocating stream_out Now when using stream reconfig to add out streams, stream->out will get re-allocated, and all old streams' information will be copied to the new ones and the old ones will be freed. So without stream->out_curr updated, next time when trying to send from stream->out_curr stream, a panic would be caused. This patch is to check and update stream->out_curr when allocating stream_out. v1->v2: - define fa_index() to get elem index from stream->out_curr. v2->v3: - repost with no change. Fixes: 5bbbbe32a431 ("sctp: introduce stream scheduler foundations") Reported-by: Ying Xu Reported-by: syzbot+e33a3a138267ca119c7d@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/stream.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 80e0ae5534ec..f24633114dfd 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -84,6 +84,19 @@ static void fa_zero(struct flex_array *fa, size_t index, size_t count) } } +static size_t fa_index(struct flex_array *fa, void *elem, size_t count) +{ + size_t index = 0; + + while (count--) { + if (elem == flex_array_get(fa, index)) + break; + index++; + } + + return index; +} + /* Migrates chunks from stream queues to new stream queues if needed, * but not across associations. Also, removes those chunks to streams * higher than the new max. @@ -147,6 +160,13 @@ static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, if (stream->out) { fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt)); + if (stream->out_curr) { + size_t index = fa_index(stream->out, stream->out_curr, + stream->outcnt); + + BUG_ON(index == stream->outcnt); + stream->out_curr = flex_array_get(out, index); + } fa_free(stream->out); } -- cgit v1.2.3 From 4e35c1cb9460240e983a01745b5f29fe3a4d8e39 Mon Sep 17 00:00:00 2001 From: Martynas Pumputis Date: Tue, 29 Jan 2019 15:51:42 +0100 Subject: netfilter: nf_nat: skip nat clash resolution for same-origin entries It is possible that two concurrent packets originating from the same socket of a connection-less protocol (e.g. UDP) can end up having different IP_CT_DIR_REPLY tuples which results in one of the packets being dropped. To illustrate this, consider the following simplified scenario: 1. Packet A and B are sent at the same time from two different threads by same UDP socket. No matching conntrack entry exists yet. Both packets cause allocation of a new conntrack entry. 2. get_unique_tuple gets called for A. No clashing entry found. conntrack entry for A is added to main conntrack table. 3. get_unique_tuple is called for B and will find that the reply tuple of B is already taken by A. It will allocate a new UDP source port for B to resolve the clash. 4. conntrack entry for B cannot be added to main conntrack table because its ORIGINAL direction is clashing with A and the REPLY directions of A and B are not the same anymore due to UDP source port reallocation done in step 3. This patch modifies nf_conntrack_tuple_taken so it doesn't consider colliding reply tuples if the IP_CT_DIR_ORIGINAL tuples are equal. [ Florian: simplify patch to not use .allow_clash setting and always ignore identical flows ] Signed-off-by: Martynas Pumputis Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 741b533148ba..db4d46332e86 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1007,6 +1007,22 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, } if (nf_ct_key_equal(h, tuple, zone, net)) { + /* Tuple is taken already, so caller will need to find + * a new source port to use. + * + * Only exception: + * If the *original tuples* are identical, then both + * conntracks refer to the same flow. + * This is a rare situation, it can occur e.g. when + * more than one UDP packet is sent from same socket + * in different threads. + * + * Let nf_ct_resolve_clash() deal with this later. + */ + if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) + continue; + NF_CT_STAT_INC_ATOMIC(net, found); rcu_read_unlock(); return 1; -- cgit v1.2.3 From f6ac8585897684374a19863fff21186a05805286 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 2 Feb 2019 10:49:13 +0100 Subject: netfilter: nf_tables: unbind set in rule from commit path Anonymous sets that are bound to rules from the same transaction trigger a kernel splat from the abort path due to double set list removal and double free. This patch updates the logic to search for the transaction that is responsible for creating the set and disable the set list removal and release, given the rule is now responsible for this. Lookup is reverse since the transaction that adds the set is likely to be at the tail of the list. Moreover, this patch adds the unbind step to deliver the event from the commit path. This should not be done from the worker thread, since we have no guarantees of in-order delivery to the listener. This patch removes the assumption that both activate and deactivate callbacks need to be provided. Fixes: cd5125d8f518 ("netfilter: nf_tables: split set destruction in deactivate and destroy phase") Reported-by: Mikhail Morfikov Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 17 ++++++-- net/netfilter/nf_tables_api.c | 85 +++++++++++++++++++-------------------- net/netfilter/nft_compat.c | 6 ++- net/netfilter/nft_dynset.c | 18 ++++----- net/netfilter/nft_immediate.c | 6 ++- net/netfilter/nft_lookup.c | 18 ++++----- net/netfilter/nft_objref.c | 18 ++++----- 7 files changed, 85 insertions(+), 83 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 841835a387e1..b4984bbbe157 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -469,9 +469,7 @@ struct nft_set_binding { int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding); void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_binding *binding); -void nf_tables_rebind_set(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_binding *binding); + struct nft_set_binding *binding, bool commit); void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); /** @@ -721,6 +719,13 @@ struct nft_expr_type { #define NFT_EXPR_STATEFUL 0x1 #define NFT_EXPR_GC 0x2 +enum nft_trans_phase { + NFT_TRANS_PREPARE, + NFT_TRANS_ABORT, + NFT_TRANS_COMMIT, + NFT_TRANS_RELEASE +}; + /** * struct nft_expr_ops - nf_tables expression operations * @@ -750,7 +755,8 @@ struct nft_expr_ops { void (*activate)(const struct nft_ctx *ctx, const struct nft_expr *expr); void (*deactivate)(const struct nft_ctx *ctx, - const struct nft_expr *expr); + const struct nft_expr *expr, + enum nft_trans_phase phase); void (*destroy)(const struct nft_ctx *ctx, const struct nft_expr *expr); void (*destroy_clone)(const struct nft_ctx *ctx, @@ -1323,12 +1329,15 @@ struct nft_trans_rule { struct nft_trans_set { struct nft_set *set; u32 set_id; + bool bound; }; #define nft_trans_set(trans) \ (((struct nft_trans_set *)trans->data)->set) #define nft_trans_set_id(trans) \ (((struct nft_trans_set *)trans->data)->set_id) +#define nft_trans_set_bound(trans) \ + (((struct nft_trans_set *)trans->data)->bound) struct nft_trans_chain { bool update; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fb07f6cfc719..5a92f23f179f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -116,6 +116,23 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } +static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct net *net = ctx->net; + struct nft_trans *trans; + + if (!nft_set_is_anonymous(set)) + return; + + list_for_each_entry_reverse(trans, &net->nft.commit_list, list) { + if (trans->msg_type == NFT_MSG_NEWSET && + nft_trans_set(trans) == set) { + nft_trans_set_bound(trans) = true; + break; + } + } +} + static int nf_tables_register_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) @@ -211,18 +228,6 @@ static int nft_delchain(struct nft_ctx *ctx) return err; } -/* either expr ops provide both activate/deactivate, or neither */ -static bool nft_expr_check_ops(const struct nft_expr_ops *ops) -{ - if (!ops) - return true; - - if (WARN_ON_ONCE((!ops->activate ^ !ops->deactivate))) - return false; - - return true; -} - static void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule) { @@ -238,14 +243,15 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx, } static void nft_rule_expr_deactivate(const struct nft_ctx *ctx, - struct nft_rule *rule) + struct nft_rule *rule, + enum nft_trans_phase phase) { struct nft_expr *expr; expr = nft_expr_first(rule); while (expr != nft_expr_last(rule) && expr->ops) { if (expr->ops->deactivate) - expr->ops->deactivate(ctx, expr); + expr->ops->deactivate(ctx, expr, phase); expr = nft_expr_next(expr); } @@ -296,7 +302,7 @@ static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) nft_trans_destroy(trans); return err; } - nft_rule_expr_deactivate(ctx, rule); + nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_PREPARE); return 0; } @@ -1929,9 +1935,6 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, */ int nft_register_expr(struct nft_expr_type *type) { - if (!nft_expr_check_ops(type->ops)) - return -EINVAL; - nfnl_lock(NFNL_SUBSYS_NFTABLES); if (type->family == NFPROTO_UNSPEC) list_add_tail_rcu(&type->list, &nf_tables_expressions); @@ -2079,10 +2082,6 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, err = PTR_ERR(ops); goto err1; } - if (!nft_expr_check_ops(ops)) { - err = -EINVAL; - goto err1; - } } else ops = type->ops; @@ -2511,7 +2510,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { - nft_rule_expr_deactivate(ctx, rule); + nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); } @@ -3708,39 +3707,30 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, bind: binding->chain = ctx->chain; list_add_tail_rcu(&binding->list, &set->bindings); + nft_set_trans_bind(ctx, set); + return 0; } EXPORT_SYMBOL_GPL(nf_tables_bind_set); -void nf_tables_rebind_set(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_binding *binding) -{ - if (list_empty(&set->bindings) && nft_set_is_anonymous(set) && - nft_is_active(ctx->net, set)) - list_add_tail_rcu(&set->list, &ctx->table->sets); - - list_add_tail_rcu(&binding->list, &set->bindings); -} -EXPORT_SYMBOL_GPL(nf_tables_rebind_set); - void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_binding *binding) + struct nft_set_binding *binding, bool event) { list_del_rcu(&binding->list); - if (list_empty(&set->bindings) && nft_set_is_anonymous(set) && - nft_is_active(ctx->net, set)) + if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) { list_del_rcu(&set->list); + if (event) + nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, + GFP_KERNEL); + } } EXPORT_SYMBOL_GPL(nf_tables_unbind_set); void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set) { - if (list_empty(&set->bindings) && nft_set_is_anonymous(set) && - nft_is_active(ctx->net, set)) { - nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC); + if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) nft_set_destroy(set); - } } EXPORT_SYMBOL_GPL(nf_tables_destroy_set); @@ -6535,6 +6525,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_rule_notify(&trans->ctx, nft_trans_rule(trans), NFT_MSG_DELRULE); + nft_rule_expr_deactivate(&trans->ctx, + nft_trans_rule(trans), + NFT_TRANS_COMMIT); break; case NFT_MSG_NEWSET: nft_clear(net, nft_trans_set(trans)); @@ -6621,7 +6614,8 @@ static void nf_tables_abort_release(struct nft_trans *trans) nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); break; case NFT_MSG_NEWSET: - nft_set_destroy(nft_trans_set(trans)); + if (!nft_trans_set_bound(trans)) + nft_set_destroy(nft_trans_set(trans)); break; case NFT_MSG_NEWSETELEM: nft_set_elem_destroy(nft_trans_elem_set(trans), @@ -6682,7 +6676,9 @@ static int __nf_tables_abort(struct net *net) case NFT_MSG_NEWRULE: trans->ctx.chain->use--; list_del_rcu(&nft_trans_rule(trans)->list); - nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans)); + nft_rule_expr_deactivate(&trans->ctx, + nft_trans_rule(trans), + NFT_TRANS_ABORT); break; case NFT_MSG_DELRULE: trans->ctx.chain->use++; @@ -6692,7 +6688,8 @@ static int __nf_tables_abort(struct net *net) break; case NFT_MSG_NEWSET: trans->ctx.table->use--; - list_del_rcu(&nft_trans_set(trans)->list); + if (!nft_trans_set_bound(trans)) + list_del_rcu(&nft_trans_set(trans)->list); break; case NFT_MSG_DELSET: trans->ctx.table->use++; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 5eb269428832..0732a2fc697c 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -587,10 +587,14 @@ static void nft_compat_activate_tg(const struct nft_ctx *ctx, } static void nft_compat_deactivate(const struct nft_ctx *ctx, - const struct nft_expr *expr) + const struct nft_expr *expr, + enum nft_trans_phase phase) { struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops); + if (phase == NFT_TRANS_COMMIT) + return; + if (--xt->listcnt == 0) list_del_init(&xt->head); } diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 07d4efd3d851..f1172f99752b 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -235,20 +235,17 @@ err1: return err; } -static void nft_dynset_activate(const struct nft_ctx *ctx, - const struct nft_expr *expr) -{ - struct nft_dynset *priv = nft_expr_priv(expr); - - nf_tables_rebind_set(ctx, priv->set, &priv->binding); -} - static void nft_dynset_deactivate(const struct nft_ctx *ctx, - const struct nft_expr *expr) + const struct nft_expr *expr, + enum nft_trans_phase phase) { struct nft_dynset *priv = nft_expr_priv(expr); - nf_tables_unbind_set(ctx, priv->set, &priv->binding); + if (phase == NFT_TRANS_PREPARE) + return; + + nf_tables_unbind_set(ctx, priv->set, &priv->binding, + phase == NFT_TRANS_COMMIT); } static void nft_dynset_destroy(const struct nft_ctx *ctx, @@ -296,7 +293,6 @@ static const struct nft_expr_ops nft_dynset_ops = { .eval = nft_dynset_eval, .init = nft_dynset_init, .destroy = nft_dynset_destroy, - .activate = nft_dynset_activate, .deactivate = nft_dynset_deactivate, .dump = nft_dynset_dump, }; diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 0777a93211e2..3f6d1d2a6281 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -72,10 +72,14 @@ static void nft_immediate_activate(const struct nft_ctx *ctx, } static void nft_immediate_deactivate(const struct nft_ctx *ctx, - const struct nft_expr *expr) + const struct nft_expr *expr, + enum nft_trans_phase phase) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + if (phase == NFT_TRANS_COMMIT) + return; + return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg)); } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 227b2b15a19c..14496da5141d 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -121,20 +121,17 @@ static int nft_lookup_init(const struct nft_ctx *ctx, return 0; } -static void nft_lookup_activate(const struct nft_ctx *ctx, - const struct nft_expr *expr) -{ - struct nft_lookup *priv = nft_expr_priv(expr); - - nf_tables_rebind_set(ctx, priv->set, &priv->binding); -} - static void nft_lookup_deactivate(const struct nft_ctx *ctx, - const struct nft_expr *expr) + const struct nft_expr *expr, + enum nft_trans_phase phase) { struct nft_lookup *priv = nft_expr_priv(expr); - nf_tables_unbind_set(ctx, priv->set, &priv->binding); + if (phase == NFT_TRANS_PREPARE) + return; + + nf_tables_unbind_set(ctx, priv->set, &priv->binding, + phase == NFT_TRANS_COMMIT); } static void nft_lookup_destroy(const struct nft_ctx *ctx, @@ -225,7 +222,6 @@ static const struct nft_expr_ops nft_lookup_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)), .eval = nft_lookup_eval, .init = nft_lookup_init, - .activate = nft_lookup_activate, .deactivate = nft_lookup_deactivate, .destroy = nft_lookup_destroy, .dump = nft_lookup_dump, diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index a3185ca2a3a9..ae178e914486 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -155,20 +155,17 @@ nla_put_failure: return -1; } -static void nft_objref_map_activate(const struct nft_ctx *ctx, - const struct nft_expr *expr) -{ - struct nft_objref_map *priv = nft_expr_priv(expr); - - nf_tables_rebind_set(ctx, priv->set, &priv->binding); -} - static void nft_objref_map_deactivate(const struct nft_ctx *ctx, - const struct nft_expr *expr) + const struct nft_expr *expr, + enum nft_trans_phase phase) { struct nft_objref_map *priv = nft_expr_priv(expr); - nf_tables_unbind_set(ctx, priv->set, &priv->binding); + if (phase == NFT_TRANS_PREPARE) + return; + + nf_tables_unbind_set(ctx, priv->set, &priv->binding, + phase == NFT_TRANS_COMMIT); } static void nft_objref_map_destroy(const struct nft_ctx *ctx, @@ -185,7 +182,6 @@ static const struct nft_expr_ops nft_objref_map_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)), .eval = nft_objref_map_eval, .init = nft_objref_map_init, - .activate = nft_objref_map_activate, .deactivate = nft_objref_map_deactivate, .destroy = nft_objref_map_destroy, .dump = nft_objref_map_dump, -- cgit v1.2.3 From ad6f317f720f4a3121756c23831a43dda9b095e5 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 4 Feb 2019 13:44:44 +0100 Subject: net/smc: preallocated memory for rdma work requests The work requests for rdma writes are built in local variables within function smc_tx_rdma_write(). This violates the rule that the work request storage has to stay till the work request is confirmed by a completion queue response. This patch introduces preallocated memory for these work requests. The storage is allocated, once a link (and thus a queue pair) is established. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_cdc.c | 11 +++-------- net/smc/smc_cdc.h | 8 +++++++- net/smc/smc_core.h | 20 ++++++++++++++++++++ net/smc/smc_llc.c | 3 ++- net/smc/smc_tx.c | 44 ++++++++++++++++++++++---------------------- net/smc/smc_wr.c | 38 +++++++++++++++++++++++++++++++++++++- net/smc/smc_wr.h | 1 + 7 files changed, 92 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 1c5333d494e9..b80ef104ab4e 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -21,13 +21,6 @@ /********************************** send *************************************/ -struct smc_cdc_tx_pend { - struct smc_connection *conn; /* socket connection */ - union smc_host_cursor cursor; /* tx sndbuf cursor sent */ - union smc_host_cursor p_cursor; /* rx RMBE cursor produced */ - u16 ctrl_seq; /* conn. tx sequence # */ -}; - /* handler for send/transmission completion of a CDC msg */ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, struct smc_link *link, @@ -61,12 +54,14 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, int smc_cdc_get_free_slot(struct smc_connection *conn, struct smc_wr_buf **wr_buf, + struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend) { struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; int rc; rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, + wr_rdma_buf, (struct smc_wr_tx_pend_priv **)pend); if (!conn->alert_token_local) /* abnormal termination */ @@ -121,7 +116,7 @@ static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) struct smc_wr_buf *wr_buf; int rc; - rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend); + rc = smc_cdc_get_free_slot(conn, &wr_buf, NULL, &pend); if (rc) return rc; diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index b5bfe38c7f9b..2148da7a26b1 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -270,10 +270,16 @@ static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, smcr_cdc_msg_to_host(local, peer, conn); } -struct smc_cdc_tx_pend; +struct smc_cdc_tx_pend { + struct smc_connection *conn; /* socket connection */ + union smc_host_cursor cursor; /* tx sndbuf cursor sent */ + union smc_host_cursor p_cursor; /* rx RMBE cursor produced */ + u16 ctrl_seq; /* conn. tx sequence # */ +}; int smc_cdc_get_free_slot(struct smc_connection *conn, struct smc_wr_buf **wr_buf, + struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend); void smc_cdc_tx_dismiss_slots(struct smc_connection *conn); int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index b00287989a3d..8806d2afa6ed 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -52,6 +52,24 @@ enum smc_wr_reg_state { FAILED /* ib_wr_reg_mr response: failure */ }; +struct smc_rdma_sge { /* sges for RDMA writes */ + struct ib_sge wr_tx_rdma_sge[SMC_IB_MAX_SEND_SGE]; +}; + +#define SMC_MAX_RDMA_WRITES 2 /* max. # of RDMA writes per + * message send + */ + +struct smc_rdma_sges { /* sges per message send */ + struct smc_rdma_sge tx_rdma_sge[SMC_MAX_RDMA_WRITES]; +}; + +struct smc_rdma_wr { /* work requests per message + * send + */ + struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES]; +}; + struct smc_link { struct smc_ib_device *smcibdev; /* ib-device */ u8 ibport; /* port - values 1 | 2 */ @@ -64,6 +82,8 @@ struct smc_link { struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */ struct ib_send_wr *wr_tx_ibs; /* WR send meta data */ struct ib_sge *wr_tx_sges; /* WR send gather meta data */ + struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/ + struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */ struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */ /* above four vectors have wr_tx_cnt elements and use the same index */ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index a6d3623d06f4..4fd60c522802 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -166,7 +166,8 @@ static int smc_llc_add_pending_send(struct smc_link *link, { int rc; - rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, pend); + rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, NULL, + pend); if (rc < 0) return rc; BUILD_BUG_ON_MSG( diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 36af3de731b9..2fdfaff60cf9 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -266,27 +266,23 @@ int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, /* sndbuf consumer: actual data transfer of one target chunk with RDMA write */ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, - int num_sges, struct ib_sge sges[]) + int num_sges, struct ib_rdma_wr *rdma_wr) { struct smc_link_group *lgr = conn->lgr; - struct ib_rdma_wr rdma_wr; struct smc_link *link; int rc; - memset(&rdma_wr, 0, sizeof(rdma_wr)); link = &lgr->lnk[SMC_SINGLE_LINK]; - rdma_wr.wr.wr_id = smc_wr_tx_get_next_wr_id(link); - rdma_wr.wr.sg_list = sges; - rdma_wr.wr.num_sge = num_sges; - rdma_wr.wr.opcode = IB_WR_RDMA_WRITE; - rdma_wr.remote_addr = + rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link); + rdma_wr->wr.num_sge = num_sges; + rdma_wr->remote_addr = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr + /* RMBE within RMB */ conn->tx_off + /* offset within RMBE */ peer_rmbe_offset; - rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; - rc = ib_post_send(link->roce_qp, &rdma_wr.wr, NULL); + rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; + rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) { conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; smc_lgr_terminate(lgr); @@ -313,24 +309,25 @@ static inline void smc_tx_advance_cursors(struct smc_connection *conn, /* SMC-R helper for smc_tx_rdma_writes() */ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, size_t src_off, size_t src_len, - size_t dst_off, size_t dst_len) + size_t dst_off, size_t dst_len, + struct smc_rdma_wr *wr_rdma_buf) { dma_addr_t dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); - struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; int src_len_sum = src_len, dst_len_sum = dst_len; - struct ib_sge sges[SMC_IB_MAX_SEND_SGE]; int sent_count = src_off; int srcchunk, dstchunk; int num_sges; int rc; for (dstchunk = 0; dstchunk < 2; dstchunk++) { + struct ib_sge *sge = + wr_rdma_buf->wr_tx_rdma[dstchunk].wr.sg_list; + num_sges = 0; for (srcchunk = 0; srcchunk < 2; srcchunk++) { - sges[srcchunk].addr = dma_addr + src_off; - sges[srcchunk].length = src_len; - sges[srcchunk].lkey = link->roce_pd->local_dma_lkey; + sge[srcchunk].addr = dma_addr + src_off; + sge[srcchunk].length = src_len; num_sges++; src_off += src_len; @@ -343,7 +340,8 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, src_len = dst_len - src_len; /* remainder */ src_len_sum += src_len; } - rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges); + rc = smc_tx_rdma_write(conn, dst_off, num_sges, + &wr_rdma_buf->wr_tx_rdma[dstchunk]); if (rc) return rc; if (dst_len_sum == len) @@ -402,7 +400,8 @@ static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len, /* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit; * usable snd_wnd as max transmit */ -static int smc_tx_rdma_writes(struct smc_connection *conn) +static int smc_tx_rdma_writes(struct smc_connection *conn, + struct smc_rdma_wr *wr_rdma_buf) { size_t len, src_len, dst_off, dst_len; /* current chunk values */ union smc_host_cursor sent, prep, prod, cons; @@ -463,7 +462,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) dst_off, dst_len); else rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len, - dst_off, dst_len); + dst_off, dst_len, wr_rdma_buf); if (rc) return rc; @@ -484,11 +483,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags; + struct smc_rdma_wr *wr_rdma_buf; struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; int rc; - rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend); + rc = smc_cdc_get_free_slot(conn, &wr_buf, &wr_rdma_buf, &pend); if (rc < 0) { if (rc == -EBUSY) { struct smc_sock *smc = @@ -506,7 +506,7 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) spin_lock_bh(&conn->send_lock); if (!conn->local_tx_ctrl.prod_flags.urg_data_present) { - rc = smc_tx_rdma_writes(conn); + rc = smc_tx_rdma_writes(conn, wr_rdma_buf); if (rc) { smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], (struct smc_wr_tx_pend_priv *)pend); @@ -533,7 +533,7 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) spin_lock_bh(&conn->send_lock); if (!pflags->urg_data_present) - rc = smc_tx_rdma_writes(conn); + rc = smc_tx_rdma_writes(conn, NULL); if (!rc) rc = smcd_cdc_msg_send(conn); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 1dc88c32d6bb..253aa75dc2b6 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -160,6 +160,7 @@ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) * @link: Pointer to smc_link used to later send the message. * @handler: Send completion handler function pointer. * @wr_buf: Out value returns pointer to message buffer. + * @wr_rdma_buf: Out value returns pointer to rdma work request. * @wr_pend_priv: Out value returns pointer serving as handler context. * * Return: 0 on success, or -errno on error. @@ -167,6 +168,7 @@ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler, struct smc_wr_buf **wr_buf, + struct smc_rdma_wr **wr_rdma_buf, struct smc_wr_tx_pend_priv **wr_pend_priv) { struct smc_wr_tx_pend *wr_pend; @@ -204,6 +206,8 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, wr_ib = &link->wr_tx_ibs[idx]; wr_ib->wr_id = wr_id; *wr_buf = &link->wr_tx_bufs[idx]; + if (wr_rdma_buf) + *wr_rdma_buf = &link->wr_tx_rdmas[idx]; *wr_pend_priv = &wr_pend->priv; return 0; } @@ -465,12 +469,26 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE; lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE; lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey; + lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey = + lnk->roce_pd->local_dma_lkey; + lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey = + lnk->roce_pd->local_dma_lkey; + lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey = + lnk->roce_pd->local_dma_lkey; + lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey = + lnk->roce_pd->local_dma_lkey; lnk->wr_tx_ibs[i].next = NULL; lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i]; lnk->wr_tx_ibs[i].num_sge = 1; lnk->wr_tx_ibs[i].opcode = IB_WR_SEND; lnk->wr_tx_ibs[i].send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; + lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE; + lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE; + lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list = + lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge; + lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list = + lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge; } for (i = 0; i < lnk->wr_rx_cnt; i++) { lnk->wr_rx_sges[i].addr = @@ -521,8 +539,12 @@ void smc_wr_free_link_mem(struct smc_link *lnk) lnk->wr_tx_mask = NULL; kfree(lnk->wr_tx_sges); lnk->wr_tx_sges = NULL; + kfree(lnk->wr_tx_rdma_sges); + lnk->wr_tx_rdma_sges = NULL; kfree(lnk->wr_rx_sges); lnk->wr_rx_sges = NULL; + kfree(lnk->wr_tx_rdmas); + lnk->wr_tx_rdmas = NULL; kfree(lnk->wr_rx_ibs); lnk->wr_rx_ibs = NULL; kfree(lnk->wr_tx_ibs); @@ -552,10 +574,20 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_rx_ibs) goto no_mem_wr_tx_ibs; + link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT, + sizeof(link->wr_tx_rdmas[0]), + GFP_KERNEL); + if (!link->wr_tx_rdmas) + goto no_mem_wr_rx_ibs; + link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT, + sizeof(link->wr_tx_rdma_sges[0]), + GFP_KERNEL); + if (!link->wr_tx_rdma_sges) + goto no_mem_wr_tx_rdmas; link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]), GFP_KERNEL); if (!link->wr_tx_sges) - goto no_mem_wr_rx_ibs; + goto no_mem_wr_tx_rdma_sges; link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, sizeof(link->wr_rx_sges[0]), GFP_KERNEL); @@ -579,6 +611,10 @@ no_mem_wr_rx_sges: kfree(link->wr_rx_sges); no_mem_wr_tx_sges: kfree(link->wr_tx_sges); +no_mem_wr_tx_rdma_sges: + kfree(link->wr_tx_rdma_sges); +no_mem_wr_tx_rdmas: + kfree(link->wr_tx_rdmas); no_mem_wr_rx_ibs: kfree(link->wr_rx_ibs); no_mem_wr_tx_ibs: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 1d85bb14fd6f..09bf32fd3959 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -85,6 +85,7 @@ void smc_wr_add_dev(struct smc_ib_device *smcibdev); int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler, struct smc_wr_buf **wr_buf, + struct smc_rdma_wr **wrs, struct smc_wr_tx_pend_priv **wr_pend_priv); int smc_wr_tx_put_slot(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv); -- cgit v1.2.3 From b8649efad879c69c7ab1f19ce8814fcabef1f72b Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 4 Feb 2019 13:44:45 +0100 Subject: net/smc: fix sender_free computation In some scenarios a separate consumer cursor update is necessary. The decision is made in smc_tx_consumer_cursor_update(). The sender_free computation could be wrong: The rx confirmed cursor is always smaller than or equal to the rx producer cursor. The parameters in the smc_curs_diff() call have to be exchanged, otherwise sender_free might even be negative. And if more data arrives local_rx_ctrl.prod might be updated, enabling a cursor difference between local_rx_ctrl.prod and rx confirmed cursor larger than the RMB size. This case is not covered by smc_curs_diff(). Thus function smc_curs_diff_large() is introduced here. If a recvmsg() is processed in parallel, local_tx_ctrl.cons might change during smc_cdc_msg_send. Make sure rx_curs_confirmed is updated with the actually sent local_tx_ctrl.cons value. Fixes: e82f2e31f559 ("net/smc: optimize consumer cursor updates") Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_cdc.c | 5 +++-- net/smc/smc_cdc.h | 26 +++++++++++++++++++++++++- net/smc/smc_tx.c | 3 ++- 3 files changed, 30 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index b80ef104ab4e..a712c9f8699b 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -91,6 +91,7 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend) { + union smc_host_cursor cfed; struct smc_link *link; int rc; @@ -102,10 +103,10 @@ int smc_cdc_msg_send(struct smc_connection *conn, conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, &conn->local_tx_ctrl, conn); + smc_curs_copy(&cfed, &((struct smc_host_cdc_msg *)wr_buf)->cons, conn); rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); if (!rc) - smc_curs_copy(&conn->rx_curs_confirmed, - &conn->local_tx_ctrl.cons, conn); + smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); return rc; } diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 2148da7a26b1..271e2524dc8f 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -160,7 +160,9 @@ static inline void smcd_curs_copy(union smcd_cdc_cursor *tgt, #endif } -/* calculate cursor difference between old and new, where old <= new */ +/* calculate cursor difference between old and new, where old <= new and + * difference cannot exceed size + */ static inline int smc_curs_diff(unsigned int size, union smc_host_cursor *old, union smc_host_cursor *new) @@ -185,6 +187,28 @@ static inline int smc_curs_comp(unsigned int size, return smc_curs_diff(size, old, new); } +/* calculate cursor difference between old and new, where old <= new and + * difference may exceed size + */ +static inline int smc_curs_diff_large(unsigned int size, + union smc_host_cursor *old, + union smc_host_cursor *new) +{ + if (old->wrap < new->wrap) + return min_t(int, + (size - old->count) + new->count + + (new->wrap - old->wrap - 1) * size, + size); + + if (old->wrap > new->wrap) /* wrap has switched from 0xffff to 0x0000 */ + return min_t(int, + (size - old->count) + new->count + + (new->wrap + 0xffff - old->wrap) * size, + size); + + return max_t(int, 0, (new->count - old->count)); +} + static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer, union smc_host_cursor *local, struct smc_connection *conn) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 2fdfaff60cf9..f93f3580c100 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -595,7 +595,8 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force) if (to_confirm > conn->rmbe_update_limit) { smc_curs_copy(&prod, &conn->local_rx_ctrl.prod, conn); sender_free = conn->rmb_desc->len - - smc_curs_diff(conn->rmb_desc->len, &prod, &cfed); + smc_curs_diff_large(conn->rmb_desc->len, + &cfed, &prod); } if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || -- cgit v1.2.3 From a5e04318c83a31925300af1ce358dbc1a708b732 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 4 Feb 2019 13:44:46 +0100 Subject: net/smc: delete rkey first before switching to unused Once RMBs are flagged as unused they are candidates for reuse. Thus the LLC DELETE RKEY operaton should be made before flagging the RMB as unused. Fixes: c7674c001b11 ("net/smc: unregister rkeys of unused buffer") Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 097c798983ca..aa1c551cee81 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -302,13 +302,13 @@ static void smc_buf_unuse(struct smc_connection *conn, conn->sndbuf_desc->used = 0; if (conn->rmb_desc) { if (!conn->rmb_desc->regerr) { - conn->rmb_desc->used = 0; if (!lgr->is_smcd) { /* unregister rmb with peer */ smc_llc_do_delete_rkey( &lgr->lnk[SMC_SINGLE_LINK], conn->rmb_desc); } + conn->rmb_desc->used = 0; } else { /* buf registration failed, reuse not possible */ write_lock_bh(&lgr->rmbs_lock); -- cgit v1.2.3 From 84b799a292ebc03de388f8573a169de6eb12c340 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 4 Feb 2019 13:44:47 +0100 Subject: net/smc: correct state change for peer closing If some kind of closing is received from the peer while still in state SMC_INIT, it means the peer has had an active connection and closed the socket quickly before listen_work finished. This should not result in a shortcut from state SMC_INIT to state SMC_CLOSED. This patch adds the socket to the accept queue in state SMC_APPCLOSEWAIT1. The socket reaches state SMC_CLOSED once being accepted and closed with smc_release(). Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_close.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'net') diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index ea2b87f29469..e39cadda1bf5 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -345,14 +345,7 @@ static void smc_close_passive_work(struct work_struct *work) switch (sk->sk_state) { case SMC_INIT: - if (atomic_read(&conn->bytes_to_rcv) || - (rxflags->peer_done_writing && - !smc_cdc_rxed_any_close(conn))) { - sk->sk_state = SMC_APPCLOSEWAIT1; - } else { - sk->sk_state = SMC_CLOSED; - sock_put(sk); /* passive closing */ - } + sk->sk_state = SMC_APPCLOSEWAIT1; break; case SMC_ACTIVE: sk->sk_state = SMC_APPCLOSEWAIT1; -- cgit v1.2.3 From c1f7e02979edd7a3a3e69fe04be60b1d650dc8a7 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Mon, 4 Feb 2019 14:50:38 +0000 Subject: net: cls_flower: Remove filter from mask before freeing it In fl_change(), when adding a new rule (i.e. fold == NULL), a driver may reject the new rule, for example due to resource exhaustion. By that point, the new rule was already assigned a mask, and it was added to that mask's hash table. The clean-up path that's invoked as a result of the rejection however neglects to undo the hash table addition, and proceeds to free the new rule, thus leaving a dangling pointer in the hash table. Fix by removing fnew from the mask's hash table before it is freed. Fixes: 35cc3cefc4de ("net/sched: cls_flower: Reject duplicated rules also under skip_sw") Signed-off-by: Petr Machata Acked-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index f6aa57fbbbaf..12ca9d13db83 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1371,7 +1371,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (!tc_skip_hw(fnew->flags)) { err = fl_hw_replace_filter(tp, fnew, extack); if (err) - goto errout_mask; + goto errout_mask_ht; } if (!tc_in_hw(fnew->flags)) @@ -1401,6 +1401,10 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, kfree(mask); return 0; +errout_mask_ht: + rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node, + fnew->mask->filter_ht_params); + errout_mask: fl_mask_put(head, fnew->mask, false); -- cgit v1.2.3 From 17ab4f61b8cd6f9c38e9d0b935d86d73b5d0d2b5 Mon Sep 17 00:00:00 2001 From: Rundong Ge Date: Sat, 2 Feb 2019 14:29:35 +0000 Subject: net: dsa: slave: Don't propagate flag changes on down slave interfaces The unbalance of master's promiscuity or allmulti will happen after ifdown and ifup a slave interface which is in a bridge. When we ifdown a slave interface , both the 'dsa_slave_close' and 'dsa_slave_change_rx_flags' will clear the master's flags. The flags of master will be decrease twice. In the other hand, if we ifup the slave interface again, since the slave's flags were cleared the 'dsa_slave_open' won't set the master's flag, only 'dsa_slave_change_rx_flags' that triggered by 'br_add_if' will set the master's flags. The flags of master is increase once. Only propagating flag changes when a slave interface is up makes sure this does not happen. The 'vlan_dev_change_rx_flags' had the same problem and was fixed, and changes here follows that fix. Fixes: 91da11f870f0 ("net: Distributed Switch Architecture protocol support") Signed-off-by: Rundong Ge Signed-off-by: David S. Miller --- net/dsa/slave.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index a3fcc1d01615..b5e44825d173 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -140,11 +140,14 @@ static int dsa_slave_close(struct net_device *dev) static void dsa_slave_change_rx_flags(struct net_device *dev, int change) { struct net_device *master = dsa_slave_to_master(dev); - - if (change & IFF_ALLMULTI) - dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1); - if (change & IFF_PROMISC) - dev_set_promiscuity(master, dev->flags & IFF_PROMISC ? 1 : -1); + if (dev->flags & IFF_UP) { + if (change & IFF_ALLMULTI) + dev_set_allmulti(master, + dev->flags & IFF_ALLMULTI ? 1 : -1); + if (change & IFF_PROMISC) + dev_set_promiscuity(master, + dev->flags & IFF_PROMISC ? 1 : -1); + } } static void dsa_slave_set_rx_mode(struct net_device *dev) -- cgit v1.2.3 From c8101f7729daee251f4f6505f9d135ec08e1342f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 2 Feb 2019 17:53:29 +0000 Subject: net: dsa: Fix lockdep false positive splat Creating a macvtap on a DSA-backed interface results in the following splat when lockdep is enabled: [ 19.638080] IPv6: ADDRCONF(NETDEV_CHANGE): lan0: link becomes ready [ 23.041198] device lan0 entered promiscuous mode [ 23.043445] device eth0 entered promiscuous mode [ 23.049255] [ 23.049557] ============================================ [ 23.055021] WARNING: possible recursive locking detected [ 23.060490] 5.0.0-rc3-00013-g56c857a1b8d3 #118 Not tainted [ 23.066132] -------------------------------------------- [ 23.071598] ip/2861 is trying to acquire lock: [ 23.076171] 00000000f61990cb (_xmit_ETHER){+...}, at: dev_set_rx_mode+0x1c/0x38 [ 23.083693] [ 23.083693] but task is already holding lock: [ 23.089696] 00000000ecf0c3b4 (_xmit_ETHER){+...}, at: dev_uc_add+0x24/0x70 [ 23.096774] [ 23.096774] other info that might help us debug this: [ 23.103494] Possible unsafe locking scenario: [ 23.103494] [ 23.109584] CPU0 [ 23.112093] ---- [ 23.114601] lock(_xmit_ETHER); [ 23.117917] lock(_xmit_ETHER); [ 23.121233] [ 23.121233] *** DEADLOCK *** [ 23.121233] [ 23.127325] May be due to missing lock nesting notation [ 23.127325] [ 23.134315] 2 locks held by ip/2861: [ 23.137987] #0: 000000003b766c72 (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x338/0x4e0 [ 23.146231] #1: 00000000ecf0c3b4 (_xmit_ETHER){+...}, at: dev_uc_add+0x24/0x70 [ 23.153757] [ 23.153757] stack backtrace: [ 23.158243] CPU: 0 PID: 2861 Comm: ip Not tainted 5.0.0-rc3-00013-g56c857a1b8d3 #118 [ 23.166212] Hardware name: Globalscale Marvell ESPRESSOBin Board (DT) [ 23.172843] Call trace: [ 23.175358] dump_backtrace+0x0/0x188 [ 23.179116] show_stack+0x14/0x20 [ 23.182524] dump_stack+0xb4/0xec [ 23.185928] __lock_acquire+0x123c/0x1860 [ 23.190048] lock_acquire+0xc8/0x248 [ 23.193724] _raw_spin_lock_bh+0x40/0x58 [ 23.197755] dev_set_rx_mode+0x1c/0x38 [ 23.201607] dev_set_promiscuity+0x3c/0x50 [ 23.205820] dsa_slave_change_rx_flags+0x5c/0x70 [ 23.210567] __dev_set_promiscuity+0x148/0x1e0 [ 23.215136] __dev_set_rx_mode+0x74/0x98 [ 23.219167] dev_uc_add+0x54/0x70 [ 23.222575] macvlan_open+0x170/0x1d0 [ 23.226336] __dev_open+0xe0/0x160 [ 23.229830] __dev_change_flags+0x16c/0x1b8 [ 23.234132] dev_change_flags+0x20/0x60 [ 23.238074] do_setlink+0x2d0/0xc50 [ 23.241658] __rtnl_newlink+0x5f8/0x6e8 [ 23.245601] rtnl_newlink+0x50/0x78 [ 23.249184] rtnetlink_rcv_msg+0x360/0x4e0 [ 23.253397] netlink_rcv_skb+0xe8/0x130 [ 23.257338] rtnetlink_rcv+0x14/0x20 [ 23.261012] netlink_unicast+0x190/0x210 [ 23.265043] netlink_sendmsg+0x288/0x350 [ 23.269075] sock_sendmsg+0x18/0x30 [ 23.272659] ___sys_sendmsg+0x29c/0x2c8 [ 23.276602] __sys_sendmsg+0x60/0xb8 [ 23.280276] __arm64_sys_sendmsg+0x1c/0x28 [ 23.284488] el0_svc_common+0xd8/0x138 [ 23.288340] el0_svc_handler+0x24/0x80 [ 23.292192] el0_svc+0x8/0xc This looks fairly harmless (no actual deadlock occurs), and is fixed in a similar way to c6894dec8ea9 ("bridge: fix lockdep addr_list_lock false positive splat") by putting the addr_list_lock in its own lockdep class. Signed-off-by: Marc Zyngier Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/master.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/dsa/master.c b/net/dsa/master.c index 71bb15f491c8..54f5551fb799 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -205,6 +205,8 @@ static void dsa_master_reset_mtu(struct net_device *dev) rtnl_unlock(); } +static struct lock_class_key dsa_master_addr_list_lock_key; + int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { int ret; @@ -218,6 +220,8 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) wmb(); dev->dsa_ptr = cpu_dp; + lockdep_set_class(&dev->addr_list_lock, + &dsa_master_addr_list_lock_key); ret = dsa_master_ethtool_setup(dev); if (ret) -- cgit v1.2.3 From 15df03c661cb362366ecfc3a21820cb934f3e4ca Mon Sep 17 00:00:00 2001 From: Eli Cooper Date: Mon, 21 Jan 2019 18:45:27 +0800 Subject: netfilter: ipv6: Don't preserve original oif for loopback address Commit 508b09046c0f ("netfilter: ipv6: Preserve link scope traffic original oif") made ip6_route_me_harder() keep the original oif for link-local and multicast packets. However, it also affected packets for the loopback address because it used rt6_need_strict(). REDIRECT rules in the OUTPUT chain rewrite the destination to loopback address; thus its oif should not be preserved. This commit fixes the bug that redirected local packets are being dropped. Actually the packet was not exactly dropped; Instead it was sent out to the original oif rather than lo. When a packet with daddr ::1 is sent to the router, it is effectively dropped. Fixes: 508b09046c0f ("netfilter: ipv6: Preserve link scope traffic original oif") Signed-off-by: Eli Cooper Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 8b075f0bc351..6d0b1f3e927b 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -23,9 +23,11 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb) struct sock *sk = sk_to_full_sk(skb->sk); unsigned int hh_len; struct dst_entry *dst; + int strict = (ipv6_addr_type(&iph->daddr) & + (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)); struct flowi6 fl6 = { .flowi6_oif = sk && sk->sk_bound_dev_if ? sk->sk_bound_dev_if : - rt6_need_strict(&iph->daddr) ? skb_dst(skb)->dev->ifindex : 0, + strict ? skb_dst(skb)->dev->ifindex : 0, .flowi6_mark = skb->mark, .flowi6_uid = sock_net_uid(net, sk), .daddr = iph->daddr, -- cgit v1.2.3 From 947e492c0fc2132ae5fca081a9c2952ccaab0404 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 5 Feb 2019 12:16:18 +0100 Subject: netfilter: nft_compat: don't use refcount_inc on newly allocated entry When I moved the refcount to refcount_t type I missed the fact that refcount_inc() will result in use-after-free warning with CONFIG_REFCOUNT_FULL=y builds. The correct fix would be to init the reference count to 1 at allocation time, but, unfortunately we cannot do this, as we can't undo that in case something else fails later in the batch. So only solution I see is to special-case the 'new entry' condition and replace refcount_inc() with a "delayed" refcount_set(1) in this case, as done here. The .activate callback can be removed to simplify things, we only need to make sure that deactivate() decrements/unlinks the entry from the list at end of transaction phase (commit or abort). Fixes: 12c44aba6618 ("netfilter: nft_compat: use refcnt_t type for nft_xt reference count") Reported-by: Jordan Glover Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 62 +++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 0732a2fc697c..fe64df848365 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -61,6 +61,21 @@ static struct nft_compat_net *nft_compat_pernet(struct net *net) return net_generic(net, nft_compat_net_id); } +static void nft_xt_get(struct nft_xt *xt) +{ + /* refcount_inc() warns on 0 -> 1 transition, but we can't + * init the reference count to 1 in .select_ops -- we can't + * undo such an increase when another expression inside the same + * rule fails afterwards. + */ + if (xt->listcnt == 0) + refcount_set(&xt->refcnt, 1); + else + refcount_inc(&xt->refcnt); + + xt->listcnt++; +} + static bool nft_xt_put(struct nft_xt *xt) { if (refcount_dec_and_test(&xt->refcnt)) { @@ -291,7 +306,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; nft_xt = container_of(expr->ops, struct nft_xt, ops); - refcount_inc(&nft_xt->refcnt); + nft_xt_get(nft_xt); return 0; } @@ -504,7 +519,7 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return ret; nft_xt = container_of(expr->ops, struct nft_xt, ops); - refcount_inc(&nft_xt->refcnt); + nft_xt_get(nft_xt); return 0; } @@ -558,45 +573,16 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) __nft_match_destroy(ctx, expr, nft_expr_priv(expr)); } -static void nft_compat_activate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - struct list_head *h) -{ - struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops); - - if (xt->listcnt == 0) - list_add(&xt->head, h); - - xt->listcnt++; -} - -static void nft_compat_activate_mt(const struct nft_ctx *ctx, - const struct nft_expr *expr) -{ - struct nft_compat_net *cn = nft_compat_pernet(ctx->net); - - nft_compat_activate(ctx, expr, &cn->nft_match_list); -} - -static void nft_compat_activate_tg(const struct nft_ctx *ctx, - const struct nft_expr *expr) -{ - struct nft_compat_net *cn = nft_compat_pernet(ctx->net); - - nft_compat_activate(ctx, expr, &cn->nft_target_list); -} - static void nft_compat_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops); - if (phase == NFT_TRANS_COMMIT) - return; - - if (--xt->listcnt == 0) - list_del_init(&xt->head); + if (phase == NFT_TRANS_ABORT || phase == NFT_TRANS_COMMIT) { + if (--xt->listcnt == 0) + list_del_init(&xt->head); + } } static void @@ -852,7 +838,6 @@ nft_match_select_ops(const struct nft_ctx *ctx, nft_match->ops.eval = nft_match_eval; nft_match->ops.init = nft_match_init; nft_match->ops.destroy = nft_match_destroy; - nft_match->ops.activate = nft_compat_activate_mt; nft_match->ops.deactivate = nft_compat_deactivate; nft_match->ops.dump = nft_match_dump; nft_match->ops.validate = nft_match_validate; @@ -870,7 +855,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, nft_match->ops.size = matchsize; - nft_match->listcnt = 1; + nft_match->listcnt = 0; list_add(&nft_match->head, &cn->nft_match_list); return &nft_match->ops; @@ -957,7 +942,6 @@ nft_target_select_ops(const struct nft_ctx *ctx, nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize)); nft_target->ops.init = nft_target_init; nft_target->ops.destroy = nft_target_destroy; - nft_target->ops.activate = nft_compat_activate_tg; nft_target->ops.deactivate = nft_compat_deactivate; nft_target->ops.dump = nft_target_dump; nft_target->ops.validate = nft_target_validate; @@ -968,7 +952,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, else nft_target->ops.eval = nft_target_eval_xt; - nft_target->listcnt = 1; + nft_target->listcnt = 0; list_add(&nft_target->head, &cn->nft_target_list); return &nft_target->ops; -- cgit v1.2.3 From 0acd99282bef617fdbc4dff29359fe8160f00846 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 6 Feb 2019 07:59:41 +0200 Subject: cfg80211: pmsr: fix MAC address setting When we *don't* have a MAC address attribute, we shouldn't try to use this - this was intended to copy the local MAC address instead, so fix it. Fixes: 9bb7e0f24e7e ("cfg80211: add peer measurement with FTM initiator API") Signed-off-by: Johannes Berg --- net/wireless/pmsr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index de9286703280..f2e388e329fd 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -256,8 +256,7 @@ int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info) if (err) goto out_err; } else { - memcpy(req->mac_addr, nla_data(info->attrs[NL80211_ATTR_MAC]), - ETH_ALEN); + memcpy(req->mac_addr, wdev_address(wdev), ETH_ALEN); memset(req->mac_addr_mask, 0xff, ETH_ALEN); } -- cgit v1.2.3 From 73350424bec9c76cf42d4d502ff156c7d5daf191 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 6 Feb 2019 08:03:10 +0200 Subject: cfg80211: pmsr: fix abort locking When we destroy the interface we already hold the wdev->mtx while calling cfg80211_pmsr_wdev_down(), which assumes this isn't true and flushes the worker that takes the lock, thus leading to a deadlock. Fix this by refactoring the worker and calling its code in cfg80211_pmsr_wdev_down() directly. We still need to flush the work later to make sure it's not still running and will crash, but it will not do anything. Fixes: 9bb7e0f24e7e ("cfg80211: add peer measurement with FTM initiator API") Signed-off-by: Johannes Berg --- net/wireless/core.c | 2 ++ net/wireless/pmsr.c | 22 +++++++++++++++------- 2 files changed, 17 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 623dfe5e211c..b36ad8efb5e5 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1068,6 +1068,8 @@ static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync) ASSERT_RTNL(); + flush_work(&wdev->pmsr_free_wk); + nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); list_del_rcu(&wdev->list); diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index f2e388e329fd..78c3f5633692 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -529,14 +529,14 @@ free: } EXPORT_SYMBOL_GPL(cfg80211_pmsr_report); -void cfg80211_pmsr_free_wk(struct work_struct *work) +static void cfg80211_pmsr_process_abort(struct wireless_dev *wdev) { - struct wireless_dev *wdev = container_of(work, struct wireless_dev, - pmsr_free_wk); struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_pmsr_request *req, *tmp; LIST_HEAD(free_list); + lockdep_assert_held(&wdev->mtx); + spin_lock_bh(&wdev->pmsr_lock); list_for_each_entry_safe(req, tmp, &wdev->pmsr_list, list) { if (req->nl_portid) @@ -546,14 +546,22 @@ void cfg80211_pmsr_free_wk(struct work_struct *work) spin_unlock_bh(&wdev->pmsr_lock); list_for_each_entry_safe(req, tmp, &free_list, list) { - wdev_lock(wdev); rdev_abort_pmsr(rdev, wdev, req); - wdev_unlock(wdev); kfree(req); } } +void cfg80211_pmsr_free_wk(struct work_struct *work) +{ + struct wireless_dev *wdev = container_of(work, struct wireless_dev, + pmsr_free_wk); + + wdev_lock(wdev); + cfg80211_pmsr_process_abort(wdev); + wdev_unlock(wdev); +} + void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev) { struct cfg80211_pmsr_request *req; @@ -567,8 +575,8 @@ void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev) spin_unlock_bh(&wdev->pmsr_lock); if (found) - schedule_work(&wdev->pmsr_free_wk); - flush_work(&wdev->pmsr_free_wk); + cfg80211_pmsr_process_abort(wdev); + WARN_ON(!list_empty(&wdev->pmsr_list)); } -- cgit v1.2.3 From 6dce3c20ac429e7a651d728e375853370c796e8d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Feb 2019 08:36:06 -0800 Subject: rxrpc: bad unlock balance in rxrpc_recvmsg When either "goto wait_interrupted;" or "goto wait_error;" paths are taken, socket lock has already been released. This patch fixes following syzbot splat : WARNING: bad unlock balance detected! 5.0.0-rc4+ #59 Not tainted ------------------------------------- syz-executor223/8256 is trying to release lock (sk_lock-AF_RXRPC) at: [] rxrpc_recvmsg+0x6d3/0x3099 net/rxrpc/recvmsg.c:598 but there are no more locks to release! other info that might help us debug this: 1 lock held by syz-executor223/8256: #0: 00000000fa9ed0f4 (slock-AF_RXRPC){+...}, at: spin_lock_bh include/linux/spinlock.h:334 [inline] #0: 00000000fa9ed0f4 (slock-AF_RXRPC){+...}, at: release_sock+0x20/0x1c0 net/core/sock.c:2798 stack backtrace: CPU: 1 PID: 8256 Comm: syz-executor223 Not tainted 5.0.0-rc4+ #59 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 print_unlock_imbalance_bug kernel/locking/lockdep.c:3391 [inline] print_unlock_imbalance_bug.cold+0x114/0x123 kernel/locking/lockdep.c:3368 __lock_release kernel/locking/lockdep.c:3601 [inline] lock_release+0x67e/0xa00 kernel/locking/lockdep.c:3860 sock_release_ownership include/net/sock.h:1471 [inline] release_sock+0x183/0x1c0 net/core/sock.c:2808 rxrpc_recvmsg+0x6d3/0x3099 net/rxrpc/recvmsg.c:598 sock_recvmsg_nosec net/socket.c:794 [inline] sock_recvmsg net/socket.c:801 [inline] sock_recvmsg+0xd0/0x110 net/socket.c:797 __sys_recvfrom+0x1ff/0x350 net/socket.c:1845 __do_sys_recvfrom net/socket.c:1863 [inline] __se_sys_recvfrom net/socket.c:1859 [inline] __x64_sys_recvfrom+0xe1/0x1a0 net/socket.c:1859 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x446379 Code: e8 2c b3 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 2b 09 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fe5da89fd98 EFLAGS: 00000246 ORIG_RAX: 000000000000002d RAX: ffffffffffffffda RBX: 00000000006dbc28 RCX: 0000000000446379 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000003 RBP: 00000000006dbc20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006dbc2c R13: 0000000000000000 R14: 0000000000000000 R15: 20c49ba5e353f7cf Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: Eric Dumazet Cc: David Howells Reported-by: syzbot Signed-off-by: David S. Miller --- net/rxrpc/recvmsg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index eaf19ebaa964..3f7bb11f3290 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -596,6 +596,7 @@ error_requeue_call: } error_no_call: release_sock(&rx->sk); +error_trace: trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); return ret; @@ -604,7 +605,7 @@ wait_interrupted: wait_error: finish_wait(sk_sleep(&rx->sk), &wait); call = NULL; - goto error_no_call; + goto error_trace; } /** -- cgit v1.2.3 From e248aa7be86e8179f20ac0931774ecd746f3f5bf Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 25 Jan 2019 16:54:54 -0500 Subject: svcrdma: Remove max_sge check at connect time Two and a half years ago, the client was changed to use gathered Send for larger inline messages, in commit 655fec6987b ("xprtrdma: Use gathered Send for large inline messages"). Several fixes were required because there are a few in-kernel device drivers whose max_sge is 3, and these were broken by the change. Apparently my memory is going, because some time later, I submitted commit 25fd86eca11c ("svcrdma: Don't overrun the SGE array in svc_rdma_send_ctxt"), and after that, commit f3c1fd0ee294 ("svcrdma: Reduce max_send_sges"). These too incorrectly assumed in-kernel device drivers would have more than a few Send SGEs available. The fix for the server side is not the same. This is because the fundamental problem on the server is that, whether or not the client has provisioned a chunk for the RPC reply, the server must squeeze even the most complex RPC replies into a single RDMA Send. Failing in the send path because of Send SGE exhaustion should never be an option. Therefore, instead of failing when the send path runs out of SGEs, switch to using a bounce buffer mechanism to handle RPC replies that are too complex for the device to send directly. That allows us to remove the max_sge check to enable drivers with small max_sge to work again. Reported-by: Don Dutile Fixes: 25fd86eca11c ("svcrdma: Don't overrun the SGE array in ...") Cc: stable@vger.kernel.org Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 105 +++++++++++++++++++++++++++++-- net/sunrpc/xprtrdma/svc_rdma_transport.c | 9 +-- 2 files changed, 102 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index cf51b8f9b15f..1f200119268c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -537,6 +537,99 @@ void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma, DMA_TO_DEVICE); } +/* If the xdr_buf has more elements than the device can + * transmit in a single RDMA Send, then the reply will + * have to be copied into a bounce buffer. + */ +static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma, + struct xdr_buf *xdr, + __be32 *wr_lst) +{ + int elements; + + /* xdr->head */ + elements = 1; + + /* xdr->pages */ + if (!wr_lst) { + unsigned int remaining; + unsigned long pageoff; + + pageoff = xdr->page_base & ~PAGE_MASK; + remaining = xdr->page_len; + while (remaining) { + ++elements; + remaining -= min_t(u32, PAGE_SIZE - pageoff, + remaining); + pageoff = 0; + } + } + + /* xdr->tail */ + if (xdr->tail[0].iov_len) + ++elements; + + /* assume 1 SGE is needed for the transport header */ + return elements >= rdma->sc_max_send_sges; +} + +/* The device is not capable of sending the reply directly. + * Assemble the elements of @xdr into the transport header + * buffer. + */ +static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt, + struct xdr_buf *xdr, __be32 *wr_lst) +{ + unsigned char *dst, *tailbase; + unsigned int taillen; + + dst = ctxt->sc_xprt_buf; + dst += ctxt->sc_sges[0].length; + + memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len); + dst += xdr->head[0].iov_len; + + tailbase = xdr->tail[0].iov_base; + taillen = xdr->tail[0].iov_len; + if (wr_lst) { + u32 xdrpad; + + xdrpad = xdr_padsize(xdr->page_len); + if (taillen && xdrpad) { + tailbase += xdrpad; + taillen -= xdrpad; + } + } else { + unsigned int len, remaining; + unsigned long pageoff; + struct page **ppages; + + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + pageoff = xdr->page_base & ~PAGE_MASK; + remaining = xdr->page_len; + while (remaining) { + len = min_t(u32, PAGE_SIZE - pageoff, remaining); + + memcpy(dst, page_address(*ppages), len); + remaining -= len; + dst += len; + pageoff = 0; + } + } + + if (taillen) + memcpy(dst, tailbase, taillen); + + ctxt->sc_sges[0].length += xdr->len; + ib_dma_sync_single_for_device(rdma->sc_pd->device, + ctxt->sc_sges[0].addr, + ctxt->sc_sges[0].length, + DMA_TO_DEVICE); + + return 0; +} + /* svc_rdma_map_reply_msg - Map the buffer holding RPC message * @rdma: controlling transport * @ctxt: send_ctxt for the Send WR @@ -559,8 +652,10 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, u32 xdr_pad; int ret; - if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges) - return -EIO; + if (svc_rdma_pull_up_needed(rdma, xdr, wr_lst)) + return svc_rdma_pull_up_reply_msg(rdma, ctxt, xdr, wr_lst); + + ++ctxt->sc_cur_sge_no; ret = svc_rdma_dma_map_buf(rdma, ctxt, xdr->head[0].iov_base, xdr->head[0].iov_len); @@ -591,8 +686,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, while (remaining) { len = min_t(u32, PAGE_SIZE - page_off, remaining); - if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges) - return -EIO; + ++ctxt->sc_cur_sge_no; ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++, page_off, len); if (ret < 0) @@ -606,8 +700,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, len = xdr->tail[0].iov_len; tail: if (len) { - if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges) - return -EIO; + ++ctxt->sc_cur_sge_no; ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len); if (ret < 0) return ret; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 924c17d46903..57f86c63a463 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -419,12 +419,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* Transport header, head iovec, tail iovec */ newxprt->sc_max_send_sges = 3; /* Add one SGE per page list entry */ - newxprt->sc_max_send_sges += svcrdma_max_req_size / PAGE_SIZE; - if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) { - pr_err("svcrdma: too few Send SGEs available (%d needed)\n", - newxprt->sc_max_send_sges); - goto errout; - } + newxprt->sc_max_send_sges += (svcrdma_max_req_size / PAGE_SIZE) + 1; + if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) + newxprt->sc_max_send_sges = dev->attrs.max_send_sge; newxprt->sc_max_req_size = svcrdma_max_req_size; newxprt->sc_max_requests = svcrdma_max_requests; newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; -- cgit v1.2.3 From 00670cb8a73b10b10d3c40f045c15411715e4465 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Feb 2019 18:35:15 +0300 Subject: net: dsa: Fix NULL checking in dsa_slave_set_eee() This function can't succeed if dp->pl is NULL. It will Oops inside the call to return phylink_ethtool_get_eee(dp->pl, e); Fixes: 1be52e97ed3e ("dsa: slave: eee: Allow ports to use phylink") Signed-off-by: Dan Carpenter Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/slave.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index b5e44825d173..a1c9fe155057 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -642,7 +642,7 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) int ret; /* Port's PHY and MAC both need to be EEE capable */ - if (!dev->phydev && !dp->pl) + if (!dev->phydev || !dp->pl) return -ENODEV; if (!ds->ops->set_mac_eee) @@ -662,7 +662,7 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) int ret; /* Port's PHY and MAC both need to be EEE capable */ - if (!dev->phydev && !dp->pl) + if (!dev->phydev || !dp->pl) return -ENODEV; if (!ds->ops->get_mac_eee) -- cgit v1.2.3 From 173656accaf583698bac3f9e269884ba60d51ef4 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 7 Feb 2019 18:36:11 +0800 Subject: sit: check if IPv6 enabled before calling ip6_err_gen_icmpv6_unreach() If we disabled IPv6 from the kernel command line (ipv6.disable=1), we should not call ip6_err_gen_icmpv6_unreach(). This: ip link add sit1 type sit local 192.0.2.1 remote 192.0.2.2 ttl 1 ip link set sit1 up ip addr add 198.51.100.1/24 dev sit1 ping 198.51.100.2 if IPv6 is disabled at boot time, will crash the kernel. v2: there's no need to use in6_dev_get(), use __in6_dev_get() instead, as we only need to check that idev exists and we are under rcu_read_lock() (from netif_receive_skb_internal()). Reported-by: Jianlin Shi Fixes: ca15a078bd90 ("sit: generate icmpv6 error when receiving icmpv4 error") Cc: Oussama Ghorbel Signed-off-by: Hangbin Liu Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/sit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 1e03305c0549..e8a1dabef803 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -546,7 +546,8 @@ static int ipip6_err(struct sk_buff *skb, u32 info) } err = 0; - if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type, data_len)) + if (__in6_dev_get(skb->dev) && + !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type, data_len)) goto out; if (t->parms.iph.daddr == 0) -- cgit v1.2.3 From c09551c6ff7fe16a79a42133bcecba5fc2fc3291 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 6 Feb 2019 19:18:04 +0100 Subject: net: ipv4: use a dedicated counter for icmp_v4 redirect packets According to the algorithm described in the comment block at the beginning of ip_rt_send_redirect, the host should try to send 'ip_rt_redirect_number' ICMP redirect packets with an exponential backoff and then stop sending them at all assuming that the destination ignores redirects. If the device has previously sent some ICMP error packets that are rate-limited (e.g TTL expired) and continues to receive traffic, the redirect packets will never be transmitted. This happens since peer->rate_tokens will be typically greater than 'ip_rt_redirect_number' and so it will never be reset even if the redirect silence timeout (ip_rt_redirect_silence) has elapsed without receiving any packet requiring redirects. Fix it by using a dedicated counter for the number of ICMP redirect packets that has been sent by the host I have not been able to identify a given commit that introduced the issue since ip_rt_send_redirect implements the same rate-limiting algorithm from commit 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- include/net/inetpeer.h | 1 + net/ipv4/inetpeer.c | 1 + net/ipv4/route.c | 7 +++++-- 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 00b5e7825508..74ff688568a0 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -39,6 +39,7 @@ struct inet_peer { u32 metrics[RTAX_MAX]; u32 rate_tokens; /* rate limiting for ICMP */ + u32 n_redirects; unsigned long rate_last; /* * Once inet_peer is queued for deletion (refcnt == 0), following field diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index d757b9642d0d..be778599bfed 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -216,6 +216,7 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, atomic_set(&p->rid, 0); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; + p->n_redirects = 0; /* 60*HZ is arbitrary, but chosen enough high so that the first * calculation of tokens is at its maximum. */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ce92f73cf104..5163b64f8fb3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -887,13 +887,15 @@ void ip_rt_send_redirect(struct sk_buff *skb) /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ - if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) + if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) { peer->rate_tokens = 0; + peer->n_redirects = 0; + } /* Too many ignored redirects; do not send anything * set dst.rate_last to the last seen redirected packet. */ - if (peer->rate_tokens >= ip_rt_redirect_number) { + if (peer->n_redirects >= ip_rt_redirect_number) { peer->rate_last = jiffies; goto out_put_peer; } @@ -910,6 +912,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; ++peer->rate_tokens; + ++peer->n_redirects; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && peer->rate_tokens == ip_rt_redirect_number) -- cgit v1.2.3 From 225d9464268599a5b4d094d02ec17808e44c7553 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 7 Feb 2019 14:13:18 +0100 Subject: vsock: cope with memory allocation failure at socket creation time In the unlikely event that the kmalloc call in vmci_transport_socket_init() fails, we end-up calling vmci_transport_destruct() with a NULL vmci_trans() and oopsing. This change addresses the above explicitly checking for zero vmci_trans() at destruction time. Reported-by: Xiumei Mu Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Signed-off-by: Paolo Abeni Reviewed-by: Stefano Garzarella Reviewed-by: Jorgen Hansen Signed-off-by: David S. Miller --- net/vmw_vsock/vmci_transport.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index c361ce782412..c3d5ab01fba7 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1651,6 +1651,10 @@ static void vmci_transport_cleanup(struct work_struct *work) static void vmci_transport_destruct(struct vsock_sock *vsk) { + /* transport can be NULL if we hit a failure at init() time */ + if (!vmci_trans(vsk)) + return; + /* Ensure that the detach callback doesn't use the sk/vsk * we are about to destruct. */ -- cgit v1.2.3 From ccc8ca9b90acb45a3309f922b2591b07b4e070ec Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 7 Feb 2019 14:52:54 +0100 Subject: net/smc: fix byte_order for rx_curs_confirmed The recent change in the rx_curs_confirmed assignment disregards byte order, which causes problems on little endian architectures. This patch fixes it. Fixes: b8649efad879 ("net/smc: fix sender_free computation") (net-tree) Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_cdc.c | 4 +--- net/smc/smc_cdc.h | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index a712c9f8699b..fb07ad8d69a6 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -101,9 +101,7 @@ int smc_cdc_msg_send(struct smc_connection *conn, conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; - smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, - &conn->local_tx_ctrl, conn); - smc_curs_copy(&cfed, &((struct smc_host_cdc_msg *)wr_buf)->cons, conn); + smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); if (!rc) smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 271e2524dc8f..f1cdde9d4b89 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -211,26 +211,27 @@ static inline int smc_curs_diff_large(unsigned int size, static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer, union smc_host_cursor *local, + union smc_host_cursor *save, struct smc_connection *conn) { - union smc_host_cursor temp; - - smc_curs_copy(&temp, local, conn); - peer->count = htonl(temp.count); - peer->wrap = htons(temp.wrap); + smc_curs_copy(save, local, conn); + peer->count = htonl(save->count); + peer->wrap = htons(save->wrap); /* peer->reserved = htons(0); must be ensured by caller */ } static inline void smc_host_msg_to_cdc(struct smc_cdc_msg *peer, - struct smc_host_cdc_msg *local, - struct smc_connection *conn) + struct smc_connection *conn, + union smc_host_cursor *save) { + struct smc_host_cdc_msg *local = &conn->local_tx_ctrl; + peer->common.type = local->common.type; peer->len = local->len; peer->seqno = htons(local->seqno); peer->token = htonl(local->token); - smc_host_cursor_to_cdc(&peer->prod, &local->prod, conn); - smc_host_cursor_to_cdc(&peer->cons, &local->cons, conn); + smc_host_cursor_to_cdc(&peer->prod, &local->prod, save, conn); + smc_host_cursor_to_cdc(&peer->cons, &local->cons, save, conn); peer->prod_flags = local->prod_flags; peer->conn_state_flags = local->conn_state_flags; } -- cgit v1.2.3 From 4926b51bfaa6d36bd6f398fb7698679d3962e19d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 6 Feb 2019 13:17:12 +0200 Subject: mac80211: call drv_ibss_join() on restart If a driver does any significant activity in its ibss_join method, then it will very well expect that to be called during restart, before any stations are added. Do that. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/util.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index d0eb38b890aa..ba950ae974fc 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -5,7 +5,7 @@ * Copyright 2007 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2019 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -2146,6 +2146,10 @@ int ieee80211_reconfig(struct ieee80211_local *local) case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: break; + case NL80211_IFTYPE_ADHOC: + if (sdata->vif.bss_conf.ibss_joined) + WARN_ON(drv_join_ibss(local, sdata)); + /* fall through */ default: ieee80211_reconfig_stations(sdata); /* fall through */ -- cgit v1.2.3 From ea18709a6f102f975178c21aa7fc4b69eeba1424 Mon Sep 17 00:00:00 2001 From: Aviya Erenfeld Date: Wed, 6 Feb 2019 13:17:08 +0200 Subject: nl80211: Fix FTM per burst maximum value Fix FTM per burst maximum value from 15 to 31 (The maximal bits that represents that number in the frame is 5 hence a maximal value of 31) Signed-off-by: Aviya Erenfeld Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 74150ad95823..d91a408db113 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -250,7 +250,7 @@ nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = { [NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION] = NLA_POLICY_MAX(NLA_U8, 15), [NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST] = - NLA_POLICY_MAX(NLA_U8, 15), + NLA_POLICY_MAX(NLA_U8, 31), [NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES] = { .type = NLA_U8 }, [NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC] = { .type = NLA_FLAG }, -- cgit v1.2.3 From ff1bab1ba19165d4402447dd39abae9e21880ebf Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 6 Feb 2019 13:17:07 +0200 Subject: cfg80211: pmsr: record netlink port ID Without recording the netlink port ID, we cannot return the results or complete messages to userspace, nor will we be able to abort if the socket is closed, so clearly we need to fill the value. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/wireless/pmsr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index 78c3f5633692..0216ab555249 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -271,6 +271,7 @@ int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info) req->n_peers = count; req->cookie = cfg80211_assign_cookie(rdev); + req->nl_portid = info->snd_portid; err = rdev_start_pmsr(rdev, wdev, req); if (err) -- cgit v1.2.3 From 1fc9b7253382ce1a83d9a3e63e88d656eb63f263 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 6 Feb 2019 13:17:14 +0200 Subject: cfg80211: prevent speculation on cfg80211_classify8021d() return It's possible that the caller of cfg80211_classify8021d() uses the value to index an array, like mac80211 in ieee80211_downgrade_queue(). Prevent speculation on the return value. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/wireless/util.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index cd48cdd582c0..ec30e3732c7b 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -5,7 +5,7 @@ * Copyright 2007-2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2019 Intel Corporation */ #include #include @@ -19,6 +19,7 @@ #include #include #include +#include #include "core.h" #include "rdev-ops.h" @@ -715,20 +716,25 @@ unsigned int cfg80211_classify8021d(struct sk_buff *skb, { unsigned int dscp; unsigned char vlan_priority; + unsigned int ret; /* skb->priority values from 256->263 are magic values to * directly indicate a specific 802.1d priority. This is used * to allow 802.1d priority to be passed directly in from VLAN * tags, etc. */ - if (skb->priority >= 256 && skb->priority <= 263) - return skb->priority - 256; + if (skb->priority >= 256 && skb->priority <= 263) { + ret = skb->priority - 256; + goto out; + } if (skb_vlan_tag_present(skb)) { vlan_priority = (skb_vlan_tag_get(skb) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; - if (vlan_priority > 0) - return vlan_priority; + if (vlan_priority > 0) { + ret = vlan_priority; + goto out; + } } switch (skb->protocol) { @@ -747,8 +753,9 @@ unsigned int cfg80211_classify8021d(struct sk_buff *skb, if (!mpls) return 0; - return (ntohl(mpls->entry) & MPLS_LS_TC_MASK) + ret = (ntohl(mpls->entry) & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; + goto out; } case htons(ETH_P_80221): /* 802.21 is always network control traffic */ @@ -761,18 +768,24 @@ unsigned int cfg80211_classify8021d(struct sk_buff *skb, unsigned int i, tmp_dscp = dscp >> 2; for (i = 0; i < qos_map->num_des; i++) { - if (tmp_dscp == qos_map->dscp_exception[i].dscp) - return qos_map->dscp_exception[i].up; + if (tmp_dscp == qos_map->dscp_exception[i].dscp) { + ret = qos_map->dscp_exception[i].up; + goto out; + } } for (i = 0; i < 8; i++) { if (tmp_dscp >= qos_map->up[i].low && - tmp_dscp <= qos_map->up[i].high) - return i; + tmp_dscp <= qos_map->up[i].high) { + ret = i; + goto out; + } } } - return dscp >> 5; + ret = dscp >> 5; +out: + return array_index_nospec(ret, IEEE80211_NUM_TIDS); } EXPORT_SYMBOL(cfg80211_classify8021d); -- cgit v1.2.3 From 6157ca0d6bfe437691b1e98a62e2efe12b6714da Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Wed, 6 Feb 2019 13:17:21 +0200 Subject: mac80211: Fix Tx aggregation session tear down with ITXQs When mac80211 requests the low level driver to stop an ongoing Tx aggregation, the low level driver is expected to call ieee80211_stop_tx_ba_cb_irqsafe() to indicate that it is ready to stop the session. The callback in turn schedules a worker to complete the session tear down, which in turn also handles the relevant state for the intermediate Tx queue. However, as this flow in asynchronous, the intermediate queue should be stopped and not continue servicing frames, as in such a case frames that are dequeued would be marked as part of an aggregation, although the aggregation is already been stopped. Fix this by stopping the intermediate Tx queue, before calling the low level driver to stop the Tx aggregation. Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 69e831bc317b..54821fb1a960 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -8,7 +8,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018 - 2019 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -366,6 +366,8 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, set_bit(HT_AGG_STATE_STOPPING, &tid_tx->state); + ieee80211_agg_stop_txq(sta, tid); + spin_unlock_bh(&sta->lock); ht_dbg(sta->sdata, "Tx BA session stop requested for %pM tid %u\n", -- cgit v1.2.3 From c4c07b4d6fa1f11880eab8e076d3d060ef3f55fc Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 6 Feb 2019 22:56:15 +0100 Subject: netfilter: nf_nat_snmp_basic: add missing length checks in ASN.1 cbs The generic ASN.1 decoder infrastructure doesn't guarantee that callbacks will get as much data as they expect; callbacks have to check the `datalen` parameter before looking at `data`. Make sure that snmp_version() and snmp_helper() don't read/write beyond the end of the packet data. (Also move the assignment to `pdata` down below the check to make it clear that it isn't necessarily a pointer we can use before the `datalen` check.) Fixes: cc2d58634e0f ("netfilter: nf_nat_snmp_basic: use asn1 decoder library") Signed-off-by: Jann Horn Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_nat_snmp_basic_main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c index a0aa13bcabda..0a8a60c1bf9a 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c @@ -105,6 +105,8 @@ static void fast_csum(struct snmp_ctx *ctx, unsigned char offset) int snmp_version(void *context, size_t hdrlen, unsigned char tag, const void *data, size_t datalen) { + if (datalen != 1) + return -EINVAL; if (*(unsigned char *)data > 1) return -ENOTSUPP; return 1; @@ -114,8 +116,11 @@ int snmp_helper(void *context, size_t hdrlen, unsigned char tag, const void *data, size_t datalen) { struct snmp_ctx *ctx = (struct snmp_ctx *)context; - __be32 *pdata = (__be32 *)data; + __be32 *pdata; + if (datalen != 4) + return -EINVAL; + pdata = (__be32 *)data; if (*pdata == ctx->from) { pr_debug("%s: %pI4 to %pI4\n", __func__, (void *)&ctx->from, (void *)&ctx->to); -- cgit v1.2.3 From 8303b7e8f018724a2cd7752eb29c2801fa8c4067 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 8 Feb 2019 16:39:52 +0100 Subject: netfilter: nat: fix spurious connection timeouts Sander Eikelenboom bisected a NAT related regression down to the l4proto->manip_pkt indirection removal. I forgot that ICMP(v6) errors (e.g. PKTTOOBIG) can be set as related to the existing conntrack entry. Therefore, when passing the skb to nf_nat_ipv4/6_manip_pkt(), that ended up calling the wrong l4 manip function, as tuple->dst.protonum is the original flows l4 protocol (TCP, UDP, etc). Set the dst protocol field to ICMP(v6), we already have a private copy of the tuple due to the inversion of src/dst. Reported-by: Sander Eikelenboom Tested-by: Sander Eikelenboom Fixes: faec18dbb0405 ("netfilter: nat: remove l4proto->manip_pkt") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 1 + net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 2687db015b6f..fa2ba7c500e4 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -215,6 +215,7 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, /* Change outer to look like the reply to an incoming packet */ nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + target.dst.protonum = IPPROTO_ICMP; if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip)) return 0; diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 23022447eb49..7a41ee3c11b4 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -226,6 +226,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, } nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + target.dst.protonum = IPPROTO_ICMPV6; if (!nf_nat_ipv6_manip_pkt(skb, 0, &target, manip)) return 0; -- cgit v1.2.3 From 989723b00b7fecad9acad55b151c73ccf2eac3ca Mon Sep 17 00:00:00 2001 From: Jouke Witteveen Date: Thu, 7 Feb 2019 17:14:32 +0100 Subject: Documentation: bring operstate documentation up-to-date Netlink has moved from bitmasks to group numbers long ago. Signed-off-by: Jouke Witteveen Signed-off-by: David S. Miller --- Documentation/networking/operstates.txt | 14 ++++++++------ net/sched/sch_generic.c | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/Documentation/networking/operstates.txt b/Documentation/networking/operstates.txt index 355c6d8ef8ad..b203d1334822 100644 --- a/Documentation/networking/operstates.txt +++ b/Documentation/networking/operstates.txt @@ -22,8 +22,9 @@ and changeable from userspace under certain rules. 2. Querying from userspace Both admin and operational state can be queried via the netlink -operation RTM_GETLINK. It is also possible to subscribe to RTMGRP_LINK -to be notified of updates. This is important for setting from userspace. +operation RTM_GETLINK. It is also possible to subscribe to RTNLGRP_LINK +to be notified of updates while the interface is admin up. This is +important for setting from userspace. These values contain interface state: @@ -101,8 +102,9 @@ because some driver controlled protocol establishment has to complete. Corresponding functions are netif_dormant_on() to set the flag, netif_dormant_off() to clear it and netif_dormant() to query. -On device allocation, networking core sets the flags equivalent to -netif_carrier_ok() and !netif_dormant(). +On device allocation, both flags __LINK_STATE_NOCARRIER and +__LINK_STATE_DORMANT are cleared, so the effective state is equivalent +to netif_carrier_ok() and !netif_dormant(). Whenever the driver CHANGES one of these flags, a workqueue event is @@ -133,11 +135,11 @@ netif_carrier_ok() && !netif_dormant() is set by the driver. Afterwards, the userspace application can set IFLA_OPERSTATE to IF_OPER_DORMANT or IF_OPER_UP as long as the driver does not set netif_carrier_off() or netif_dormant_on(). Changes made by userspace -are multicasted on the netlink group RTMGRP_LINK. +are multicasted on the netlink group RTNLGRP_LINK. So basically a 802.1X supplicant interacts with the kernel like this: --subscribe to RTMGRP_LINK +-subscribe to RTNLGRP_LINK -set IFLA_LINKMODE to 1 via RTM_SETLINK -query RTM_GETLINK once to get initial state -if initial flags are not (IFF_LOWER_UP && !IFF_DORMANT), wait until diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 66ba2ce2320f..968a85fe4d4a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -500,7 +500,7 @@ static void dev_watchdog_down(struct net_device *dev) * netif_carrier_on - set carrier * @dev: network device * - * Device has detected that carrier. + * Device has detected acquisition of carrier. */ void netif_carrier_on(struct net_device *dev) { -- cgit v1.2.3 From cf657d22ee1f0e887326a92169f2e28dc932fd10 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Feb 2019 12:41:05 -0800 Subject: net/x25: do not hold the cpu too long in x25_new_lci() Due to quadratic behavior of x25_new_lci(), syzbot was able to trigger an rcu stall. Fix this by not blocking BH for the whole duration of the function, and inserting a reschedule point when possible. If we care enough, using a bitmap could get rid of the quadratic behavior. syzbot report : rcu: INFO: rcu_preempt self-detected stall on CPU rcu: 0-...!: (10500 ticks this GP) idle=4fa/1/0x4000000000000002 softirq=283376/283376 fqs=0 rcu: (t=10501 jiffies g=383105 q=136) rcu: rcu_preempt kthread starved for 10502 jiffies! g383105 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x402 ->cpu=0 rcu: RCU grace-period kthread stack dump: rcu_preempt I28928 10 2 0x80000000 Call Trace: context_switch kernel/sched/core.c:2844 [inline] __schedule+0x817/0x1cc0 kernel/sched/core.c:3485 schedule+0x92/0x180 kernel/sched/core.c:3529 schedule_timeout+0x4db/0xfd0 kernel/time/timer.c:1803 rcu_gp_fqs_loop kernel/rcu/tree.c:1948 [inline] rcu_gp_kthread+0x956/0x17a0 kernel/rcu/tree.c:2105 kthread+0x357/0x430 kernel/kthread.c:246 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352 NMI backtrace for cpu 0 CPU: 0 PID: 8759 Comm: syz-executor2 Not tainted 5.0.0-rc4+ #51 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 nmi_cpu_backtrace.cold+0x63/0xa4 lib/nmi_backtrace.c:101 nmi_trigger_cpumask_backtrace+0x1be/0x236 lib/nmi_backtrace.c:62 arch_trigger_cpumask_backtrace+0x14/0x20 arch/x86/kernel/apic/hw_nmi.c:38 trigger_single_cpu_backtrace include/linux/nmi.h:164 [inline] rcu_dump_cpu_stacks+0x183/0x1cf kernel/rcu/tree.c:1211 print_cpu_stall kernel/rcu/tree.c:1348 [inline] check_cpu_stall kernel/rcu/tree.c:1422 [inline] rcu_pending kernel/rcu/tree.c:3018 [inline] rcu_check_callbacks.cold+0x500/0xa4a kernel/rcu/tree.c:2521 update_process_times+0x32/0x80 kernel/time/timer.c:1635 tick_sched_handle+0xa2/0x190 kernel/time/tick-sched.c:161 tick_sched_timer+0x47/0x130 kernel/time/tick-sched.c:1271 __run_hrtimer kernel/time/hrtimer.c:1389 [inline] __hrtimer_run_queues+0x33e/0xde0 kernel/time/hrtimer.c:1451 hrtimer_interrupt+0x314/0x770 kernel/time/hrtimer.c:1509 local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1035 [inline] smp_apic_timer_interrupt+0x120/0x570 arch/x86/kernel/apic/apic.c:1060 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:807 RIP: 0010:__read_once_size include/linux/compiler.h:193 [inline] RIP: 0010:queued_write_lock_slowpath+0x13e/0x290 kernel/locking/qrwlock.c:86 Code: 00 00 fc ff df 4c 8d 2c 01 41 83 c7 03 41 0f b6 45 00 41 38 c7 7c 08 84 c0 0f 85 0c 01 00 00 8b 03 3d 00 01 00 00 74 1a f3 90 <41> 0f b6 55 00 41 38 d7 7c eb 84 d2 74 e7 48 89 df e8 6c 0f 4f 00 RSP: 0018:ffff88805f117bd8 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff13 RAX: 0000000000000300 RBX: ffffffff89413ba0 RCX: 1ffffffff1282774 RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffffffff89413ba0 RBP: ffff88805f117c70 R08: 1ffffffff1282774 R09: fffffbfff1282775 R10: fffffbfff1282774 R11: ffffffff89413ba3 R12: 00000000000000ff R13: fffffbfff1282774 R14: 1ffff1100be22f7d R15: 0000000000000003 queued_write_lock include/asm-generic/qrwlock.h:104 [inline] do_raw_write_lock+0x1d6/0x290 kernel/locking/spinlock_debug.c:203 __raw_write_lock_bh include/linux/rwlock_api_smp.h:204 [inline] _raw_write_lock_bh+0x3b/0x50 kernel/locking/spinlock.c:312 x25_insert_socket+0x21/0xe0 net/x25/af_x25.c:267 x25_bind+0x273/0x340 net/x25/af_x25.c:705 __sys_bind+0x23f/0x290 net/socket.c:1505 __do_sys_bind net/socket.c:1516 [inline] __se_sys_bind net/socket.c:1514 [inline] __x64_sys_bind+0x73/0xb0 net/socket.c:1514 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457e39 Code: ad b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fafccd0dc78 EFLAGS: 00000246 ORIG_RAX: 0000000000000031 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000457e39 RDX: 0000000000000012 RSI: 0000000020000240 RDI: 0000000000000004 RBP: 000000000073bf00 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007fafccd0e6d4 R13: 00000000004bdf8b R14: 00000000004ce4b8 R15: 00000000ffffffff Sending NMI from CPU 0 to CPUs 1: NMI backtrace for cpu 1 CPU: 1 PID: 8752 Comm: syz-executor4 Not tainted 5.0.0-rc4+ #51 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__x25_find_socket+0x78/0x120 net/x25/af_x25.c:328 Code: 89 f8 48 c1 e8 03 80 3c 18 00 0f 85 a6 00 00 00 4d 8b 64 24 68 4d 85 e4 74 7f e8 03 97 3d fb 49 83 ec 68 74 74 e8 f8 96 3d fb <49> 8d bc 24 88 04 00 00 48 89 f8 48 c1 e8 03 0f b6 04 18 84 c0 74 RSP: 0018:ffff8880639efc58 EFLAGS: 00000246 RAX: 0000000000040000 RBX: dffffc0000000000 RCX: ffffc9000e677000 RDX: 0000000000040000 RSI: ffffffff863244b8 RDI: ffff88806a764628 RBP: ffff8880639efc80 R08: ffff8880a80d05c0 R09: fffffbfff1282775 R10: fffffbfff1282774 R11: ffffffff89413ba3 R12: ffff88806a7645c0 R13: 0000000000000001 R14: ffff88809f29ac00 R15: 0000000000000000 FS: 00007fe8d0c58700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b32823000 CR3: 00000000672eb000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: x25_new_lci net/x25/af_x25.c:357 [inline] x25_connect+0x374/0xdf0 net/x25/af_x25.c:786 __sys_connect+0x266/0x330 net/socket.c:1686 __do_sys_connect net/socket.c:1697 [inline] __se_sys_connect net/socket.c:1694 [inline] __x64_sys_connect+0x73/0xb0 net/socket.c:1694 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457e39 Code: ad b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fe8d0c57c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002a RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000457e39 RDX: 0000000000000012 RSI: 0000000020000200 RDI: 0000000000000004 RBP: 000000000073bf00 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007fe8d0c586d4 R13: 00000000004be378 R14: 00000000004ceb00 R15: 00000000ffffffff Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Andrew Hendry Cc: linux-x25@vger.kernel.org Signed-off-by: David S. Miller --- net/x25/af_x25.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 5121729b8b63..ec3a828672ef 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -352,17 +352,15 @@ static unsigned int x25_new_lci(struct x25_neigh *nb) unsigned int lci = 1; struct sock *sk; - read_lock_bh(&x25_list_lock); - - while ((sk = __x25_find_socket(lci, nb)) != NULL) { + while ((sk = x25_find_socket(lci, nb)) != NULL) { sock_put(sk); if (++lci == 4096) { lci = 0; break; } + cond_resched(); } - read_unlock_bh(&x25_list_lock); return lci; } -- cgit v1.2.3 From 8d29d16d21342a0c86405d46de0c4ac5daf1760f Mon Sep 17 00:00:00 2001 From: Francesco Ruggeri Date: Sun, 10 Feb 2019 11:58:29 -0800 Subject: netfilter: compat: initialize all fields in xt_init If a non zero value happens to be in xt[NFPROTO_BRIDGE].cur at init time, the following panic can be caused by running % ebtables -t broute -F BROUTING from a 32-bit user level on a 64-bit kernel. This patch replaces kmalloc_array with kcalloc when allocating xt. [ 474.680846] BUG: unable to handle kernel paging request at 0000000009600920 [ 474.687869] PGD 2037006067 P4D 2037006067 PUD 2038938067 PMD 0 [ 474.693838] Oops: 0000 [#1] SMP [ 474.697055] CPU: 9 PID: 4662 Comm: ebtables Kdump: loaded Not tainted 4.19.17-11302235.AroraKernelnext.fc18.x86_64 #1 [ 474.707721] Hardware name: Supermicro X9DRT/X9DRT, BIOS 3.0 06/28/2013 [ 474.714313] RIP: 0010:xt_compat_calc_jump+0x2f/0x63 [x_tables] [ 474.720201] Code: 40 0f b6 ff 55 31 c0 48 6b ff 70 48 03 3d dc 45 00 00 48 89 e5 8b 4f 6c 4c 8b 47 60 ff c9 39 c8 7f 2f 8d 14 08 d1 fa 48 63 fa <41> 39 34 f8 4c 8d 0c fd 00 00 00 00 73 05 8d 42 01 eb e1 76 05 8d [ 474.739023] RSP: 0018:ffffc9000943fc58 EFLAGS: 00010207 [ 474.744296] RAX: 0000000000000000 RBX: ffffc90006465000 RCX: 0000000002580249 [ 474.751485] RDX: 00000000012c0124 RSI: fffffffff7be17e9 RDI: 00000000012c0124 [ 474.758670] RBP: ffffc9000943fc58 R08: 0000000000000000 R09: ffffffff8117cf8f [ 474.765855] R10: ffffc90006477000 R11: 0000000000000000 R12: 0000000000000001 [ 474.773048] R13: 0000000000000000 R14: ffffc9000943fcb8 R15: ffffc9000943fcb8 [ 474.780234] FS: 0000000000000000(0000) GS:ffff88a03f840000(0063) knlGS:00000000f7ac7700 [ 474.788612] CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 [ 474.794632] CR2: 0000000009600920 CR3: 0000002037422006 CR4: 00000000000606e0 [ 474.802052] Call Trace: [ 474.804789] compat_do_replace+0x1fb/0x2a3 [ebtables] [ 474.810105] compat_do_ebt_set_ctl+0x69/0xe6 [ebtables] [ 474.815605] ? try_module_get+0x37/0x42 [ 474.819716] compat_nf_setsockopt+0x4f/0x6d [ 474.824172] compat_ip_setsockopt+0x7e/0x8c [ 474.828641] compat_raw_setsockopt+0x16/0x3a [ 474.833220] compat_sock_common_setsockopt+0x1d/0x24 [ 474.838458] __compat_sys_setsockopt+0x17e/0x1b1 [ 474.843343] ? __check_object_size+0x76/0x19a [ 474.847960] __ia32_compat_sys_socketcall+0x1cb/0x25b [ 474.853276] do_fast_syscall_32+0xaf/0xf6 [ 474.857548] entry_SYSENTER_compat+0x6b/0x7a Signed-off-by: Francesco Ruggeri Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index aecadd471e1d..13e1ac333fa4 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1899,7 +1899,7 @@ static int __init xt_init(void) seqcount_init(&per_cpu(xt_recseq, i)); } - xt = kmalloc_array(NFPROTO_NUMPROTO, sizeof(struct xt_af), GFP_KERNEL); + xt = kcalloc(NFPROTO_NUMPROTO, sizeof(struct xt_af), GFP_KERNEL); if (!xt) return -ENOMEM; -- cgit v1.2.3 From 7384b538d3aed2ed49d3575483d17aeee790fb06 Mon Sep 17 00:00:00 2001 From: Hoang Le Date: Mon, 11 Feb 2019 09:18:28 +0700 Subject: tipc: fix skb may be leaky in tipc_link_input When we free skb at tipc_data_input, we return a 'false' boolean. Then, skb passed to subcalling tipc_link_input in tipc_link_rcv, 1303 int tipc_link_rcv: ... 1354 if (!tipc_data_input(l, skb, l->inputq)) 1355 rc |= tipc_link_input(l, skb, l->inputq); Fix it by simple changing to a 'true' boolean when skb is being free-ed. Then, tipc_link_rcv will bypassed to subcalling tipc_link_input as above condition. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Hoang Le Signed-off-by: David S. Miller --- net/tipc/link.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 2792a3cae682..7c70034b1073 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1145,7 +1145,7 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, default: pr_warn("Dropping received illegal msg type\n"); kfree_skb(skb); - return false; + return true; }; } -- cgit v1.2.3 From e75913c93f7cd5f338ab373c34c93a655bd309cb Mon Sep 17 00:00:00 2001 From: Zhiqiang Liu Date: Mon, 11 Feb 2019 10:57:46 +0800 Subject: net: fix IPv6 prefix route residue Follow those steps: # ip addr add 2001:123::1/32 dev eth0 # ip addr add 2001:123:456::2/64 dev eth0 # ip addr del 2001:123::1/32 dev eth0 # ip addr del 2001:123:456::2/64 dev eth0 and then prefix route of 2001:123::1/32 will still exist. This is because ipv6_prefix_equal in check_cleanup_prefix_route func does not check whether two IPv6 addresses have the same prefix length. If the prefix of one address starts with another shorter address prefix, even though their prefix lengths are different, the return value of ipv6_prefix_equal is true. Here I add a check of whether two addresses have the same prefix to decide whether their prefixes are equal. Fixes: 5b84efecb7d9 ("ipv6 addrconf: don't cleanup prefix route for IFA_F_NOPREFIXROUTE") Signed-off-by: Zhiqiang Liu Reported-by: Wenhao Zhang Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 84c358804355..72ffd3d760ff 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1165,7 +1165,8 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires) list_for_each_entry(ifa, &idev->addr_list, if_list) { if (ifa == ifp) continue; - if (!ipv6_prefix_equal(&ifa->addr, &ifp->addr, + if (ifa->prefix_len != ifp->prefix_len || + !ipv6_prefix_equal(&ifa->addr, &ifp->addr, ifp->prefix_len)) continue; if (ifa->flags & (IFA_F_PERMANENT | IFA_F_NOPREFIXROUTE)) -- cgit v1.2.3 From 91986ee166cf0816ae92668476ea7872d51b0c6e Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Mon, 11 Feb 2019 13:29:43 +0700 Subject: tipc: fix link session and re-establish issues When a link endpoint is re-created (e.g. after a node reboot or interface reset), the link session number is varied by random, the peer endpoint will be synced with this new session number before the link is re-established. However, there is a shortcoming in this mechanism that can lead to the link never re-established or faced with a failure then. It happens when the peer endpoint is ready in ESTABLISHING state, the 'peer_session' as well as the 'in_session' flag have been set, but suddenly this link endpoint leaves. When it comes back with a random session number, there are two situations possible: 1/ If the random session number is larger than (or equal to) the previous one, the peer endpoint will be updated with this new session upon receipt of a RESET_MSG from this endpoint, and the link can be re- established as normal. Otherwise, all the RESET_MSGs from this endpoint will be rejected by the peer. In turn, when this link endpoint receives one ACTIVATE_MSG from the peer, it will move to ESTABLISHED and start to send STATE_MSGs, but again these messages will be dropped by the peer due to wrong session. The peer link endpoint can still become ESTABLISHED after receiving a traffic message from this endpoint (e.g. a BCAST_PROTOCOL or NAME_DISTRIBUTOR), but since all the STATE_MSGs are invalid, the link will be forced down sooner or later! Even in case the random session number is larger than the previous one, it can be that the ACTIVATE_MSG from the peer arrives first, and this link endpoint moves quickly to ESTABLISHED without sending out any RESET_MSG yet. Consequently, the peer link will not be updated with the new session number, and the same link failure scenario as above will happen. 2/ Another situation can be that, the peer link endpoint was reset due to any reasons in the meantime, its link state was set to RESET from ESTABLISHING but still in session, i.e. the 'in_session' flag is not reset... Now, if the random session number from this endpoint is less than the previous one, all the RESET_MSGs from this endpoint will be rejected by the peer. In the other direction, when this link endpoint receives a RESET_MSG from the peer, it moves to ESTABLISHING and starts to send ACTIVATE_MSGs, but all these messages will be rejected by the peer too. As a result, the link cannot be re-established but gets stuck with this link endpoint in state ESTABLISHING and the peer in RESET! Solution: =========== This link endpoint should not go directly to ESTABLISHED when getting ACTIVATE_MSG from the peer which may belong to the old session if the link was re-created. To ensure the session to be correct before the link is re-established, the peer endpoint in ESTABLISHING state will send back the last session number in ACTIVATE_MSG for a verification at this endpoint. Then, if needed, a new and more appropriate session number will be regenerated to force a re-synch first. In addition, when a link in ESTABLISHING state is reset, its state will move to RESET according to the link FSM, along with resetting the 'in_session' flag (and the other data) as a normal link reset, it will also be deleted if requested. The solution is backward compatible. Acked-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/link.c | 15 +++++++++++++++ net/tipc/msg.h | 22 ++++++++++++++++++++++ net/tipc/node.c | 11 ++++++----- 3 files changed, 43 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 7c70034b1073..85ad5c0678d0 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1425,6 +1425,10 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, l->rcv_unacked = 0; } else { /* RESET_MSG or ACTIVATE_MSG */ + if (mtyp == ACTIVATE_MSG) { + msg_set_dest_session_valid(hdr, 1); + msg_set_dest_session(hdr, l->peer_session); + } msg_set_max_pkt(hdr, l->advertised_mtu); strcpy(data, l->if_name); msg_set_size(hdr, INT_H_SIZE + TIPC_MAX_IF_NAME); @@ -1642,6 +1646,17 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); break; } + + /* If this endpoint was re-created while peer was ESTABLISHING + * it doesn't know current session number. Force re-synch. + */ + if (mtyp == ACTIVATE_MSG && msg_dest_session_valid(hdr) && + l->session != msg_dest_session(hdr)) { + if (less(l->session, msg_dest_session(hdr))) + l->session = msg_dest_session(hdr) + 1; + break; + } + /* ACTIVATE_MSG serves as PEER_RESET if link is already down */ if (mtyp == RESET_MSG || !link_is_up(l)) rc = tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index a0924956bb61..d7e4b8b93f9d 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -360,6 +360,28 @@ static inline void msg_set_bcast_ack(struct tipc_msg *m, u16 n) msg_set_bits(m, 1, 0, 0xffff, n); } +/* Note: reusing bits in word 1 for ACTIVATE_MSG only, to re-synch + * link peer session number + */ +static inline bool msg_dest_session_valid(struct tipc_msg *m) +{ + return msg_bits(m, 1, 16, 0x1); +} + +static inline void msg_set_dest_session_valid(struct tipc_msg *m, bool valid) +{ + msg_set_bits(m, 1, 16, 0x1, valid); +} + +static inline u16 msg_dest_session(struct tipc_msg *m) +{ + return msg_bits(m, 1, 0, 0xffff); +} + +static inline void msg_set_dest_session(struct tipc_msg *m, u16 n) +{ + msg_set_bits(m, 1, 0, 0xffff, n); +} /* * Word 2 diff --git a/net/tipc/node.c b/net/tipc/node.c index db2a6c3e0be9..2dc4919ab23c 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -830,15 +830,16 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) tipc_node_write_lock(n); if (!tipc_link_is_establishing(l)) { __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr); - if (delete) { - kfree(l); - le->link = NULL; - n->link_cnt--; - } } else { /* Defuse pending tipc_node_link_up() */ + tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); } + if (delete) { + kfree(l); + le->link = NULL; + n->link_cnt--; + } trace_tipc_node_link_down(n, true, "node link down or deleted!"); tipc_node_write_unlock(n); if (delete) -- cgit v1.2.3 From 098e13f5b21d3398065fce8780f07a3ef62f4812 Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Mon, 11 Feb 2019 16:14:39 +0100 Subject: ipvs: fix dependency on nf_defrag_ipv6 ipvs relies on nf_defrag_ipv6 module to manage IPv6 fragmentation, but lacks proper Kconfig dependencies and does not explicitly request defrag features. As a result, if netfilter hooks are not loaded, when IPv6 fragmented packet are handled by ipvs only the first fragment makes through. Fix it properly declaring the dependency on Kconfig and registering netfilter hooks on ip_vs_add_service() and ip_vs_new_dest(). Reported-by: Li Shuang Signed-off-by: Andrea Claudi Acked-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/Kconfig | 1 + net/netfilter/ipvs/ip_vs_core.c | 10 ++++------ net/netfilter/ipvs/ip_vs_ctl.c | 10 ++++++++++ 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index cad48d07c818..8401cefd9f65 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -29,6 +29,7 @@ config IP_VS_IPV6 bool "IPv6 support for IPVS" depends on IPV6 = y || IP_VS = IPV6 select IP6_NF_IPTABLES + select NF_DEFRAG_IPV6 ---help--- Add IPv6 support to IPVS. diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index fe9abf3cc10a..235205c93e14 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1536,14 +1536,12 @@ ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, /* sorry, all this trouble for a no-hit :) */ IP_VS_DBG_PKT(12, af, pp, skb, iph->off, "ip_vs_in: packet continues traversal as normal"); - if (iph->fragoffs) { - /* Fragment that couldn't be mapped to a conn entry - * is missing module nf_defrag_ipv6 - */ - IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n"); + + /* Fragment couldn't be mapped to a conn entry */ + if (iph->fragoffs) IP_VS_DBG_PKT(7, af, pp, skb, iph->off, "unhandled fragment"); - } + *verdict = NF_ACCEPT; return 0; } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 7d6318664eb2..86afacb07e5f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -43,6 +43,7 @@ #ifdef CONFIG_IP_VS_IPV6 #include #include +#include #endif #include #include @@ -895,6 +896,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, { struct ip_vs_dest *dest; unsigned int atype, i; + int ret = 0; EnterFunction(2); @@ -905,6 +907,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, atype & IPV6_ADDR_LINKLOCAL) && !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6)) return -EINVAL; + + ret = nf_defrag_ipv6_enable(svc->ipvs->net); + if (ret) + return ret; } else #endif { @@ -1228,6 +1234,10 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ret = -EINVAL; goto out_err; } + + ret = nf_defrag_ipv6_enable(ipvs->net); + if (ret) + goto out_err; } #endif -- cgit v1.2.3 From d1f20798a119be71746949ba9b2e2ff330fdc038 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 11 Feb 2019 19:32:20 +0800 Subject: ipv6: propagate genlmsg_reply return code genlmsg_reply can fail, so propagate its return code Fixes: 915d7e5e593 ("ipv6: sr: add code base for control plane support of SR-IPv6") Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/ipv6/seg6.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 8d0ba757a46c..9b2f272ca164 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -221,9 +221,7 @@ static int seg6_genl_get_tunsrc(struct sk_buff *skb, struct genl_info *info) rcu_read_unlock(); genlmsg_end(msg, hdr); - genlmsg_reply(msg, info); - - return 0; + return genlmsg_reply(msg, info); nla_put_failure: rcu_read_unlock(); -- cgit v1.2.3 From 4ffcbfac60642f63ae3d80891f573ba7e94a265c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Feb 2019 14:41:22 -0800 Subject: batman-adv: fix uninit-value in batadv_interface_tx() KMSAN reported batadv_interface_tx() was possibly using a garbage value [1] batadv_get_vid() does have a pskb_may_pull() call but batadv_interface_tx() does not actually make sure this did not fail. [1] BUG: KMSAN: uninit-value in batadv_interface_tx+0x908/0x1e40 net/batman-adv/soft-interface.c:231 CPU: 0 PID: 10006 Comm: syz-executor469 Not tainted 4.20.0-rc7+ #5 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x173/0x1d0 lib/dump_stack.c:113 kmsan_report+0x12e/0x2a0 mm/kmsan/kmsan.c:613 __msan_warning+0x82/0xf0 mm/kmsan/kmsan_instr.c:313 batadv_interface_tx+0x908/0x1e40 net/batman-adv/soft-interface.c:231 __netdev_start_xmit include/linux/netdevice.h:4356 [inline] netdev_start_xmit include/linux/netdevice.h:4365 [inline] xmit_one net/core/dev.c:3257 [inline] dev_hard_start_xmit+0x607/0xc40 net/core/dev.c:3273 __dev_queue_xmit+0x2e42/0x3bc0 net/core/dev.c:3843 dev_queue_xmit+0x4b/0x60 net/core/dev.c:3876 packet_snd net/packet/af_packet.c:2928 [inline] packet_sendmsg+0x8306/0x8f30 net/packet/af_packet.c:2953 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] __sys_sendto+0x8c4/0xac0 net/socket.c:1788 __do_sys_sendto net/socket.c:1800 [inline] __se_sys_sendto+0x107/0x130 net/socket.c:1796 __x64_sys_sendto+0x6e/0x90 net/socket.c:1796 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 RIP: 0033:0x441889 Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 bb 10 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007ffdda6fd468 EFLAGS: 00000216 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 0000000000441889 RDX: 000000000000000e RSI: 00000000200000c0 RDI: 0000000000000003 RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000216 R12: 00007ffdda6fd4c0 R13: 00007ffdda6fd4b0 R14: 0000000000000000 R15: 0000000000000000 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:204 [inline] kmsan_internal_poison_shadow+0x92/0x150 mm/kmsan/kmsan.c:158 kmsan_kmalloc+0xa6/0x130 mm/kmsan/kmsan_hooks.c:176 kmsan_slab_alloc+0xe/0x10 mm/kmsan/kmsan_hooks.c:185 slab_post_alloc_hook mm/slab.h:446 [inline] slab_alloc_node mm/slub.c:2759 [inline] __kmalloc_node_track_caller+0xe18/0x1030 mm/slub.c:4383 __kmalloc_reserve net/core/skbuff.c:137 [inline] __alloc_skb+0x309/0xa20 net/core/skbuff.c:205 alloc_skb include/linux/skbuff.h:998 [inline] alloc_skb_with_frags+0x1c7/0xac0 net/core/skbuff.c:5220 sock_alloc_send_pskb+0xafd/0x10e0 net/core/sock.c:2083 packet_alloc_skb net/packet/af_packet.c:2781 [inline] packet_snd net/packet/af_packet.c:2872 [inline] packet_sendmsg+0x661a/0x8f30 net/packet/af_packet.c:2953 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg net/socket.c:631 [inline] __sys_sendto+0x8c4/0xac0 net/socket.c:1788 __do_sys_sendto net/socket.c:1800 [inline] __se_sys_sendto+0x107/0x130 net/socket.c:1796 __x64_sys_sendto+0x6e/0x90 net/socket.c:1796 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Marek Lindner Cc: Simon Wunderlich Cc: Antonio Quartulli Signed-off-by: David S. Miller --- net/batman-adv/soft-interface.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index b85ca809e509..ffc83bebfe40 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -227,6 +227,8 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, switch (ntohs(ethhdr->h_proto)) { case ETH_P_8021Q: + if (!pskb_may_pull(skb, sizeof(*vhdr))) + goto dropped; vhdr = vlan_eth_hdr(skb); /* drop batman-in-batman packets to prevent loops */ -- cgit v1.2.3 From 1ec17dbd90f8b638f41ee650558609c1af63dfa0 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 9 Feb 2019 13:35:52 +0300 Subject: inet_diag: fix reporting cgroup classid and fallback to priority Field idiag_ext in struct inet_diag_req_v2 used as bitmap of requested extensions has only 8 bits. Thus extensions starting from DCTCPINFO cannot be requested directly. Some of them included into response unconditionally or hook into some of lower 8 bits. Extension INET_DIAG_CLASS_ID has not way to request from the beginning. This patch bundle it with INET_DIAG_TCLASS (ipv6 tos), fixes space reservation, and documents behavior for other extensions. Also this patch adds fallback to reporting socket priority. This filed is more widely used for traffic classification because ipv4 sockets automatically maps TOS to priority and default qdisc pfifo_fast knows about that. But priority could be changed via setsockopt SO_PRIORITY so INET_DIAG_TOS isn't enough for predicting class. Also cgroup2 obsoletes net_cls classid (it always zero), but we cannot reuse this field for reporting cgroup2 id because it is 64-bit (ino+gen). So, after this patch INET_DIAG_CLASS_ID will report socket priority for most common setup when net_cls isn't set and/or cgroup2 in use. Fixes: 0888e372c37f ("net: inet: diag: expose sockets cgroup classid") Signed-off-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 16 +++++++++++----- net/ipv4/inet_diag.c | 10 +++++++++- net/sctp/diag.c | 1 + 3 files changed, 21 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 14565d703291..e8baca85bac6 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -137,15 +137,21 @@ enum { INET_DIAG_TCLASS, INET_DIAG_SKMEMINFO, INET_DIAG_SHUTDOWN, - INET_DIAG_DCTCPINFO, - INET_DIAG_PROTOCOL, /* response attribute only */ + + /* + * Next extenstions cannot be requested in struct inet_diag_req_v2: + * its field idiag_ext has only 8 bits. + */ + + INET_DIAG_DCTCPINFO, /* request as INET_DIAG_VEGASINFO */ + INET_DIAG_PROTOCOL, /* response attribute only */ INET_DIAG_SKV6ONLY, INET_DIAG_LOCALS, INET_DIAG_PEERS, INET_DIAG_PAD, - INET_DIAG_MARK, - INET_DIAG_BBRINFO, - INET_DIAG_CLASS_ID, + INET_DIAG_MARK, /* only with CAP_NET_ADMIN */ + INET_DIAG_BBRINFO, /* request as INET_DIAG_VEGASINFO */ + INET_DIAG_CLASS_ID, /* request as INET_DIAG_TCLASS */ INET_DIAG_MD5SIG, __INET_DIAG_MAX, }; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 1a4e9ff02762..5731670c560b 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -108,6 +108,7 @@ static size_t inet_sk_attr_size(struct sock *sk, + nla_total_size(1) /* INET_DIAG_TOS */ + nla_total_size(1) /* INET_DIAG_TCLASS */ + nla_total_size(4) /* INET_DIAG_MARK */ + + nla_total_size(4) /* INET_DIAG_CLASS_ID */ + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(sizeof(struct inet_diag_msg)) + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) @@ -287,12 +288,19 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, goto errout; } - if (ext & (1 << (INET_DIAG_CLASS_ID - 1))) { + if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) || + ext & (1 << (INET_DIAG_TCLASS - 1))) { u32 classid = 0; #ifdef CONFIG_SOCK_CGROUP_DATA classid = sock_cgroup_classid(&sk->sk_cgrp_data); #endif + /* Fallback to socket priority if class id isn't set. + * Classful qdiscs use it as direct reference to class. + * For cgroup2 classid is always zero. + */ + if (!classid) + classid = sk->sk_priority; if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid)) goto errout; diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 078f01a8d582..435847d98b51 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -256,6 +256,7 @@ static size_t inet_assoc_attr_size(struct sctp_association *asoc) + nla_total_size(1) /* INET_DIAG_TOS */ + nla_total_size(1) /* INET_DIAG_TCLASS */ + nla_total_size(4) /* INET_DIAG_MARK */ + + nla_total_size(4) /* INET_DIAG_CLASS_ID */ + nla_total_size(addrlen * asoc->peer.transport_count) + nla_total_size(addrlen * addrcnt) + nla_total_size(sizeof(struct inet_diag_meminfo)) -- cgit v1.2.3 From fc62814d690cf62189854464f4bd07457d5e9e50 Mon Sep 17 00:00:00 2001 From: Kal Conley Date: Sun, 10 Feb 2019 09:57:11 +0100 Subject: net/packet: fix 4gb buffer limit due to overflow check When calculating rb->frames_per_block * req->tp_block_nr the result can overflow. Check it for overflow without limiting the total buffer size to UINT_MAX. This change fixes support for packet ring buffers >= UINT_MAX. Fixes: 8f8d28e4d6d8 ("net/packet: fix overflow in check for tp_frame_nr") Signed-off-by: Kal Conley Signed-off-by: David S. Miller --- net/packet/af_packet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3b1a78906bc0..1cd1d83a4be0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4292,7 +4292,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, rb->frames_per_block = req->tp_block_size / req->tp_frame_size; if (unlikely(rb->frames_per_block == 0)) goto out; - if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr)) + if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr)) goto out; if (unlikely((rb->frames_per_block * req->tp_block_nr) != req->tp_frame_nr)) -- cgit v1.2.3 From 8015d93ebd27484418d4952284fd02172fa4b0b2 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 11 Feb 2019 13:06:14 -0800 Subject: net_sched: fix a race condition in tcindex_destroy() tcindex_destroy() invokes tcindex_destroy_element() via a walker to delete each filter result in its perfect hash table, and tcindex_destroy_element() calls tcindex_delete() which schedules tcf RCU works to do the final deletion work. Unfortunately this races with the RCU callback __tcindex_destroy(), which could lead to use-after-free as reported by Adrian. Fix this by migrating this RCU callback to tcf RCU work too, as that workqueue is ordered, we will not have use-after-free. Note, we don't need to hold netns refcnt because we don't call tcf_exts_destroy() here. Fixes: 27ce4f05e2ab ("net_sched: use tcf_queue_work() in tcindex filter") Reported-by: Adrian Cc: Ben Hutchings Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 9ccc93f257db..79b52a637dda 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -48,7 +48,7 @@ struct tcindex_data { u32 hash; /* hash table size; 0 if undefined */ u32 alloc_hash; /* allocated size */ u32 fall_through; /* 0: only classify if explicit match */ - struct rcu_head rcu; + struct rcu_work rwork; }; static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) @@ -229,9 +229,11 @@ static int tcindex_destroy_element(struct tcf_proto *tp, return tcindex_delete(tp, arg, &last, NULL); } -static void __tcindex_destroy(struct rcu_head *head) +static void tcindex_destroy_work(struct work_struct *work) { - struct tcindex_data *p = container_of(head, struct tcindex_data, rcu); + struct tcindex_data *p = container_of(to_rcu_work(work), + struct tcindex_data, + rwork); kfree(p->perfect); kfree(p->h); @@ -258,9 +260,11 @@ static int tcindex_filter_result_init(struct tcindex_filter_result *r) return tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); } -static void __tcindex_partial_destroy(struct rcu_head *head) +static void tcindex_partial_destroy_work(struct work_struct *work) { - struct tcindex_data *p = container_of(head, struct tcindex_data, rcu); + struct tcindex_data *p = container_of(to_rcu_work(work), + struct tcindex_data, + rwork); kfree(p->perfect); kfree(p); @@ -478,7 +482,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } if (oldp) - call_rcu(&oldp->rcu, __tcindex_partial_destroy); + tcf_queue_work(&oldp->rwork, tcindex_partial_destroy_work); return 0; errout_alloc: @@ -570,7 +574,7 @@ static void tcindex_destroy(struct tcf_proto *tp, walker.fn = tcindex_destroy_element; tcindex_walk(tp, &walker); - call_rcu(&p->rcu, __tcindex_destroy); + tcf_queue_work(&p->rwork, tcindex_destroy_work); } -- cgit v1.2.3 From 033b228e7f26b29ae37f8bfa1bc6b209a5365e9f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 11 Feb 2019 13:06:15 -0800 Subject: net_sched: fix a memory leak in cls_tcindex When tcindex_destroy() destroys all the filter results in the perfect hash table, it invokes the walker to delete each of them. However, results with class==0 are skipped in either tcindex_walk() or tcindex_delete(), which causes a memory leak reported by kmemleak. This patch fixes it by skipping the walker and directly deleting these filter results so we don't miss any filter result. As a result of this change, we have to initialize exts->net properly in tcindex_alloc_perfect_hash(). For net-next, we need to consider whether we should initialize ->net in tcf_exts_init() instead, before that just directly test CONFIG_NET_CLS_ACT=y. Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 79b52a637dda..70ea5b1a7889 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -221,14 +221,6 @@ found: return 0; } -static int tcindex_destroy_element(struct tcf_proto *tp, - void *arg, struct tcf_walker *walker) -{ - bool last; - - return tcindex_delete(tp, arg, &last, NULL); -} - static void tcindex_destroy_work(struct work_struct *work) { struct tcindex_data *p = container_of(to_rcu_work(work), @@ -279,7 +271,7 @@ static void tcindex_free_perfect_hash(struct tcindex_data *cp) kfree(cp->perfect); } -static int tcindex_alloc_perfect_hash(struct tcindex_data *cp) +static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp) { int i, err = 0; @@ -293,6 +285,9 @@ static int tcindex_alloc_perfect_hash(struct tcindex_data *cp) TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) goto errout; +#ifdef CONFIG_NET_CLS_ACT + cp->perfect[i].exts.net = net; +#endif } return 0; @@ -341,7 +336,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (p->perfect) { int i; - if (tcindex_alloc_perfect_hash(cp) < 0) + if (tcindex_alloc_perfect_hash(net, cp) < 0) goto errout; for (i = 0; i < cp->hash; i++) cp->perfect[i].res = p->perfect[i].res; @@ -410,7 +405,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, err = -ENOMEM; if (!cp->perfect && !cp->h) { if (valid_perfect_hash(cp)) { - if (tcindex_alloc_perfect_hash(cp) < 0) + if (tcindex_alloc_perfect_hash(net, cp) < 0) goto errout_alloc; balloc = 1; } else { @@ -566,13 +561,32 @@ static void tcindex_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcf_walker walker; + int i; pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); - walker.count = 0; - walker.skip = 0; - walker.fn = tcindex_destroy_element; - tcindex_walk(tp, &walker); + + if (p->perfect) { + for (i = 0; i < p->hash; i++) { + struct tcindex_filter_result *r = p->perfect + i; + + tcf_unbind_filter(tp, &r->res); + if (tcf_exts_get_net(&r->exts)) + tcf_queue_work(&r->rwork, + tcindex_destroy_rexts_work); + else + __tcindex_destroy_rexts(r); + } + } + + for (i = 0; p->h && i < p->hash; i++) { + struct tcindex_filter *f, *next; + bool last; + + for (f = rtnl_dereference(p->h[i]); f; f = next) { + next = rtnl_dereference(f->next); + tcindex_delete(tp, &f->result, &last, NULL); + } + } tcf_queue_work(&p->rwork, tcindex_destroy_work); } -- cgit v1.2.3 From 1db817e75f5b9387b8db11e37d5f0624eb9223e0 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 11 Feb 2019 13:06:16 -0800 Subject: net_sched: fix two more memory leaks in cls_tcindex struct tcindex_filter_result contains two parts: struct tcf_exts and struct tcf_result. For the local variable 'cr', its exts part is never used but initialized without being released properly on success path. So just completely remove the exts part to fix this leak. For the local variable 'new_filter_result', it is never properly released if not used by 'r' on success path. Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 70ea5b1a7889..38bb882bb958 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -304,9 +304,9 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct nlattr *est, bool ovr, struct netlink_ext_ack *extack) { struct tcindex_filter_result new_filter_result, *old_r = r; - struct tcindex_filter_result cr; struct tcindex_data *cp = NULL, *oldp; struct tcindex_filter *f = NULL; /* make gcc behave */ + struct tcf_result cr = {}; int err, balloc = 0; struct tcf_exts e; @@ -345,13 +345,10 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, cp->h = p->h; err = tcindex_filter_result_init(&new_filter_result); - if (err < 0) - goto errout1; - err = tcindex_filter_result_init(&cr); if (err < 0) goto errout1; if (old_r) - cr.res = r->res; + cr = r->res; if (tb[TCA_TCINDEX_HASH]) cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); @@ -442,8 +439,8 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } if (tb[TCA_TCINDEX_CLASSID]) { - cr.res.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]); - tcf_bind_filter(tp, &cr.res, base); + cr.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]); + tcf_bind_filter(tp, &cr, base); } if (old_r && old_r != r) { @@ -455,7 +452,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } oldp = p; - r->res = cr.res; + r->res = cr; tcf_exts_change(&r->exts, &e); rcu_assign_pointer(tp->root, cp); @@ -474,6 +471,8 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, ; /* nothing */ rcu_assign_pointer(*fp, f); + } else { + tcf_exts_destroy(&new_filter_result.exts); } if (oldp) @@ -486,7 +485,6 @@ errout_alloc: else if (balloc == 2) kfree(cp->h); errout1: - tcf_exts_destroy(&cr.exts); tcf_exts_destroy(&new_filter_result.exts); errout: kfree(cp); -- cgit v1.2.3 From a4cb5bdb754afe21f3e9e7164213e8600cf69427 Mon Sep 17 00:00:00 2001 From: Nicolas Morey-Chaisemartin Date: Tue, 5 Feb 2019 18:21:02 +0100 Subject: xprtrdma: Make sure Send CQ is allocated on an existing compvec Make sure the device has at least 2 completion vectors before allocating to compvec#1 Fixes: a4699f5647f3 (xprtrdma: Put Send CQ in IB_POLL_WORKQUEUE mode) Signed-off-by: Nicolas Morey-Chaisemartin Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 4994e75945b8..21113bfd4eca 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -527,7 +527,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, sendcq = ib_alloc_cq(ia->ri_device, NULL, ep->rep_attr.cap.max_send_wr + 1, - 1, IB_POLL_WORKQUEUE); + ia->ri_device->num_comp_vectors > 1 ? 1 : 0, + IB_POLL_WORKQUEUE); if (IS_ERR(sendcq)) { rc = PTR_ERR(sendcq); goto out1; -- cgit v1.2.3 From ad6fef776927b4172e55a1bd97b3cd441a0c261c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 12 Feb 2019 19:27:34 +0100 Subject: rpc: properly check debugfs dentry before using it debugfs can now report an error code if something went wrong instead of just NULL. So if the return value is to be used as a "real" dentry, it needs to be checked if it is an error before dereferencing it. This is now happening because of ff9fb72bc077 ("debugfs: return error values, not NULL"), but why debugfs files are not being created properly is an older issue, probably one that has always been there and should probably be looked at... Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Trond Myklebust Cc: Anna Schumaker Cc: linux-nfs@vger.kernel.org Cc: netdev@vger.kernel.org Reported-by: David Howells Tested-by: David Howells Signed-off-by: Greg Kroah-Hartman Signed-off-by: Anna Schumaker --- net/sunrpc/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index 45a033329cd4..19bb356230ed 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -146,7 +146,7 @@ rpc_clnt_debugfs_register(struct rpc_clnt *clnt) rcu_read_lock(); xprt = rcu_dereference(clnt->cl_xprt); /* no "debugfs" dentry? Don't bother with the symlink. */ - if (!xprt->debugfs) { + if (IS_ERR_OR_NULL(xprt->debugfs)) { rcu_read_unlock(); return; } -- cgit v1.2.3 From 753c111f655e38bbd52fc01321266633f022ebe2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 13 Feb 2019 13:03:53 +0100 Subject: netfilter: nft_compat: use-after-free when deleting targets Fetch pointer to module before target object is released. Fixes: 29e3880109e3 ("netfilter: nf_tables: fix use-after-free when deleting compat expressions") Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index fe64df848365..0a4bad55a8aa 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -315,6 +315,7 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct xt_target *target = expr->ops->data; void *info = nft_expr_priv(expr); + struct module *me = target->me; struct xt_tgdtor_param par; par.net = ctx->net; @@ -325,7 +326,7 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) par.target->destroy(&par); if (nft_xt_put(container_of(expr->ops, struct nft_xt, ops))) - module_put(target->me); + module_put(me); } static int nft_extension_dump_info(struct sk_buff *skb, int attr, -- cgit v1.2.3 From fc228abc2347e106a44c0e9b29ab70b712c4ca51 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 12 Feb 2019 18:47:30 +0800 Subject: sctp: call gso_reset_checksum when computing checksum in sctp_gso_segment Jianlin reported a panic when running sctp gso over gre over vlan device: [ 84.772930] RIP: 0010:do_csum+0x6d/0x170 [ 84.790605] Call Trace: [ 84.791054] csum_partial+0xd/0x20 [ 84.791657] gre_gso_segment+0x2c3/0x390 [ 84.792364] inet_gso_segment+0x161/0x3e0 [ 84.793071] skb_mac_gso_segment+0xb8/0x120 [ 84.793846] __skb_gso_segment+0x7e/0x180 [ 84.794581] validate_xmit_skb+0x141/0x2e0 [ 84.795297] __dev_queue_xmit+0x258/0x8f0 [ 84.795949] ? eth_header+0x26/0xc0 [ 84.796581] ip_finish_output2+0x196/0x430 [ 84.797295] ? skb_gso_validate_network_len+0x11/0x80 [ 84.798183] ? ip_finish_output+0x169/0x270 [ 84.798875] ip_output+0x6c/0xe0 [ 84.799413] ? ip_append_data.part.50+0xc0/0xc0 [ 84.800145] iptunnel_xmit+0x144/0x1c0 [ 84.800814] ip_tunnel_xmit+0x62d/0x930 [ip_tunnel] [ 84.801699] gre_tap_xmit+0xac/0xf0 [ip_gre] [ 84.802395] dev_hard_start_xmit+0xa5/0x210 [ 84.803086] sch_direct_xmit+0x14f/0x340 [ 84.803733] __dev_queue_xmit+0x799/0x8f0 [ 84.804472] ip_finish_output2+0x2e0/0x430 [ 84.805255] ? skb_gso_validate_network_len+0x11/0x80 [ 84.806154] ip_output+0x6c/0xe0 [ 84.806721] ? ip_append_data.part.50+0xc0/0xc0 [ 84.807516] sctp_packet_transmit+0x716/0xa10 [sctp] [ 84.808337] sctp_outq_flush+0xd7/0x880 [sctp] It was caused by SKB_GSO_CB(skb)->csum_start not set in sctp_gso_segment. sctp_gso_segment() calls skb_segment() with 'feature | NETIF_F_HW_CSUM', which causes SKB_GSO_CB(skb)->csum_start not to be set in skb_segment(). For TCP/UDP, when feature supports HW_CSUM, CHECKSUM_PARTIAL will be set and gso_reset_checksum will be called to set SKB_GSO_CB(skb)->csum_start. So SCTP should do the same as TCP/UDP, to call gso_reset_checksum() when computing checksum in sctp_gso_segment. Reported-by: Jianlin Shi Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/offload.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 123e9f2dc226..edfcf16e704c 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -36,6 +36,7 @@ static __le32 sctp_gso_make_checksum(struct sk_buff *skb) { skb->ip_summed = CHECKSUM_NONE; skb->csum_not_inet = 0; + gso_reset_checksum(skb, ~0); return sctp_compute_cksum(skb, skb_transport_offset(skb)); } -- cgit v1.2.3 From af98c5a78517c04adb5fd68bb64b1ad6fe3d473f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 12 Feb 2019 18:51:01 +0800 Subject: sctp: set stream ext to NULL after freeing it in sctp_stream_outq_migrate In sctp_stream_init(), after sctp_stream_outq_migrate() freed the surplus streams' ext, but sctp_stream_alloc_out() returns -ENOMEM, stream->outcnt will not be set to 'outcnt'. With the bigger value on stream->outcnt, when closing the assoc and freeing its streams, the ext of those surplus streams will be freed again since those stream exts were not set to NULL after freeing in sctp_stream_outq_migrate(). Then the invalid-free issue reported by syzbot would be triggered. We fix it by simply setting them to NULL after freeing. Fixes: 5bbbbe32a431 ("sctp: introduce stream scheduler foundations") Reported-by: syzbot+58e480e7b28f2d890bfd@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/stream.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index f24633114dfd..2936ed17bf9e 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -144,8 +144,10 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream, } } - for (i = outcnt; i < stream->outcnt; i++) + for (i = outcnt; i < stream->outcnt; i++) { kfree(SCTP_SO(stream, i)->ext); + SCTP_SO(stream, i)->ext = NULL; + } } static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, -- cgit v1.2.3 From 5bf325a53202b8728cf7013b72688c46071e212e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Feb 2019 12:26:27 -0800 Subject: net: fix possible overflow in __sk_mem_raise_allocated() With many active TCP sockets, fat TCP sockets could fool __sk_mem_raise_allocated() thanks to an overflow. They would increase their share of the memory, instead of decreasing it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- net/core/sock.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 2b229f7be8eb..f43f935cb113 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1277,7 +1277,7 @@ static inline void sk_sockets_allocated_inc(struct sock *sk) percpu_counter_inc(sk->sk_prot->sockets_allocated); } -static inline int +static inline u64 sk_sockets_allocated_read_positive(struct sock *sk) { return percpu_counter_read_positive(sk->sk_prot->sockets_allocated); diff --git a/net/core/sock.c b/net/core/sock.c index 6aa2e7e0b4fb..bc3512f230a3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2380,7 +2380,7 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) } if (sk_has_memory_pressure(sk)) { - int alloc; + u64 alloc; if (!sk_under_memory_pressure(sk)) return 1; -- cgit v1.2.3 From e7afe6c1d486b516ed586dcc10b3e7e3e85a9c2b Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Fri, 15 Feb 2019 13:42:02 -0500 Subject: sunrpc: fix 4 more call sites that were using stack memory with a scatterlist While trying to reproduce a reported kernel panic on arm64, I discovered that AUTH_GSS basically doesn't work at all with older enctypes on arm64 systems with CONFIG_VMAP_STACK enabled. It turns out there still a few places using stack memory with scatterlists, causing krb5_encrypt() and krb5_decrypt() to produce incorrect results (or a BUG if CONFIG_DEBUG_SG is enabled). Tested with cthon on v4.0/v4.1/v4.2 with krb5/krb5i/krb5p using des3-cbc-sha1 and arcfour-hmac-md5. Signed-off-by: Scott Mayhew Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/gss_krb5_seqnum.c | 49 +++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c index fb6656295204..507105127095 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c +++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c @@ -44,7 +44,7 @@ krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum, unsigned char *cksum, unsigned char *buf) { struct crypto_sync_skcipher *cipher; - unsigned char plain[8]; + unsigned char *plain; s32 code; dprintk("RPC: %s:\n", __func__); @@ -52,6 +52,10 @@ krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum, if (IS_ERR(cipher)) return PTR_ERR(cipher); + plain = kmalloc(8, GFP_NOFS); + if (!plain) + return -ENOMEM; + plain[0] = (unsigned char) ((seqnum >> 24) & 0xff); plain[1] = (unsigned char) ((seqnum >> 16) & 0xff); plain[2] = (unsigned char) ((seqnum >> 8) & 0xff); @@ -67,6 +71,7 @@ krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum, code = krb5_encrypt(cipher, cksum, plain, buf, 8); out: + kfree(plain); crypto_free_sync_skcipher(cipher); return code; } @@ -77,12 +82,17 @@ krb5_make_seq_num(struct krb5_ctx *kctx, u32 seqnum, unsigned char *cksum, unsigned char *buf) { - unsigned char plain[8]; + unsigned char *plain; + s32 code; if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) return krb5_make_rc4_seq_num(kctx, direction, seqnum, cksum, buf); + plain = kmalloc(8, GFP_NOFS); + if (!plain) + return -ENOMEM; + plain[0] = (unsigned char) (seqnum & 0xff); plain[1] = (unsigned char) ((seqnum >> 8) & 0xff); plain[2] = (unsigned char) ((seqnum >> 16) & 0xff); @@ -93,7 +103,9 @@ krb5_make_seq_num(struct krb5_ctx *kctx, plain[6] = direction; plain[7] = direction; - return krb5_encrypt(key, cksum, plain, buf, 8); + code = krb5_encrypt(key, cksum, plain, buf, 8); + kfree(plain); + return code; } static s32 @@ -101,7 +113,7 @@ krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, unsigned char *buf, int *direction, s32 *seqnum) { struct crypto_sync_skcipher *cipher; - unsigned char plain[8]; + unsigned char *plain; s32 code; dprintk("RPC: %s:\n", __func__); @@ -113,20 +125,28 @@ krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum, if (code) goto out; + plain = kmalloc(8, GFP_NOFS); + if (!plain) { + code = -ENOMEM; + goto out; + } + code = krb5_decrypt(cipher, cksum, buf, plain, 8); if (code) - goto out; + goto out_plain; if ((plain[4] != plain[5]) || (plain[4] != plain[6]) || (plain[4] != plain[7])) { code = (s32)KG_BAD_SEQ; - goto out; + goto out_plain; } *direction = plain[4]; *seqnum = ((plain[0] << 24) | (plain[1] << 16) | (plain[2] << 8) | (plain[3])); +out_plain: + kfree(plain); out: crypto_free_sync_skcipher(cipher); return code; @@ -139,7 +159,7 @@ krb5_get_seq_num(struct krb5_ctx *kctx, int *direction, u32 *seqnum) { s32 code; - unsigned char plain[8]; + unsigned char *plain; struct crypto_sync_skcipher *key = kctx->seq; dprintk("RPC: krb5_get_seq_num:\n"); @@ -147,18 +167,25 @@ krb5_get_seq_num(struct krb5_ctx *kctx, if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) return krb5_get_rc4_seq_num(kctx, cksum, buf, direction, seqnum); + plain = kmalloc(8, GFP_NOFS); + if (!plain) + return -ENOMEM; if ((code = krb5_decrypt(key, cksum, buf, plain, 8))) - return code; + goto out; if ((plain[4] != plain[5]) || (plain[4] != plain[6]) || - (plain[4] != plain[7])) - return (s32)KG_BAD_SEQ; + (plain[4] != plain[7])) { + code = (s32)KG_BAD_SEQ; + goto out; + } *direction = plain[4]; *seqnum = ((plain[0]) | (plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24)); - return 0; +out: + kfree(plain); + return code; } -- cgit v1.2.3