diff options
Diffstat (limited to 'net')
75 files changed, 1601 insertions, 2644 deletions
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index ab3b654b05cc..4e2576fc0c59 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -273,9 +273,6 @@ static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev, struct lowpan_peer *peer) { const u8 *saddr; - struct lowpan_btle_dev *dev; - - dev = lowpan_btle_dev(netdev); saddr = peer->lladdr; @@ -618,12 +615,8 @@ static void ifup(struct net_device *netdev) static void ifdown(struct net_device *netdev) { - int err; - rtnl_lock(); - err = dev_close(netdev); - if (err < 0) - BT_INFO("iface %s cannot be closed (%d)", netdev->name, err); + dev_close(netdev); rtnl_unlock(); } diff --git a/net/core/Makefile b/net/core/Makefile index 79f9479e9658..d501c4278015 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -11,7 +11,6 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o -obj-$(CONFIG_XFRM) += flow.o obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o diff --git a/net/core/dev.c b/net/core/dev.c index 8515f8fe0460..8ea6b4b42611 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -144,6 +144,7 @@ #include <linux/netfilter_ingress.h> #include <linux/crash_dump.h> #include <linux/sctp.h> +#include <net/udp_tunnel.h> #include "net-sysfs.h" @@ -1413,7 +1414,7 @@ int dev_open(struct net_device *dev) } EXPORT_SYMBOL(dev_open); -static int __dev_close_many(struct list_head *head) +static void __dev_close_many(struct list_head *head) { struct net_device *dev; @@ -1455,23 +1456,18 @@ static int __dev_close_many(struct list_head *head) dev->flags &= ~IFF_UP; netpoll_poll_enable(dev); } - - return 0; } -static int __dev_close(struct net_device *dev) +static void __dev_close(struct net_device *dev) { - int retval; LIST_HEAD(single); list_add(&dev->close_list, &single); - retval = __dev_close_many(&single); + __dev_close_many(&single); list_del(&single); - - return retval; } -int dev_close_many(struct list_head *head, bool unlink) +void dev_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; @@ -1488,8 +1484,6 @@ int dev_close_many(struct list_head *head, bool unlink) if (unlink) list_del_init(&dev->close_list); } - - return 0; } EXPORT_SYMBOL(dev_close_many); @@ -1502,7 +1496,7 @@ EXPORT_SYMBOL(dev_close_many); * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier * chain. */ -int dev_close(struct net_device *dev) +void dev_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); @@ -1511,7 +1505,6 @@ int dev_close(struct net_device *dev) dev_close_many(&single, true); list_del(&single); } - return 0; } EXPORT_SYMBOL(dev_close); @@ -3865,6 +3858,121 @@ drop: return NET_RX_DROP; } +static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + struct xdp_buff xdp; + u32 act = XDP_DROP; + void *orig_data; + int hlen, off; + u32 mac_len; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_cloned(skb)) + return XDP_PASS; + + if (skb_linearize(skb)) + goto do_drop; + + /* The XDP program wants to see the packet starting at the MAC + * header. + */ + mac_len = skb->data - skb_mac_header(skb); + hlen = skb_headlen(skb) + mac_len; + xdp.data = skb->data - mac_len; + xdp.data_end = xdp.data + hlen; + xdp.data_hard_start = skb->data - skb_headroom(skb); + orig_data = xdp.data; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + off = xdp.data - orig_data; + if (off > 0) + __skb_pull(skb, off); + else if (off < 0) + __skb_push(skb, -off); + + switch (act) { + case XDP_REDIRECT: + case XDP_TX: + __skb_push(skb, mac_len); + /* fall through */ + case XDP_PASS: + break; + + default: + bpf_warn_invalid_xdp_action(act); + /* fall through */ + case XDP_ABORTED: + trace_xdp_exception(skb->dev, xdp_prog, act); + /* fall through */ + case XDP_DROP: + do_drop: + kfree_skb(skb); + break; + } + + return act; +} + +/* When doing generic XDP we have to bypass the qdisc layer and the + * network taps in order to match in-driver-XDP behavior. + */ +static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +{ + struct net_device *dev = skb->dev; + struct netdev_queue *txq; + bool free_skb = true; + int cpu, rc; + + txq = netdev_pick_tx(dev, skb, NULL); + cpu = smp_processor_id(); + HARD_TX_LOCK(dev, txq, cpu); + if (!netif_xmit_stopped(txq)) { + rc = netdev_start_xmit(skb, dev, txq, 0); + if (dev_xmit_complete(rc)) + free_skb = false; + } + HARD_TX_UNLOCK(dev, txq); + if (free_skb) { + trace_xdp_exception(dev, xdp_prog, XDP_TX); + kfree_skb(skb); + } +} + +static struct static_key generic_xdp_needed __read_mostly; + +static int do_xdp_generic(struct sk_buff *skb) +{ + struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); + + if (xdp_prog) { + u32 act = netif_receive_generic_xdp(skb, xdp_prog); + int err; + + if (act != XDP_PASS) { + switch (act) { + case XDP_REDIRECT: + err = xdp_do_generic_redirect(skb->dev, skb); + if (err) + goto out_redir; + /* fallthru to submit skb */ + case XDP_TX: + generic_xdp_tx(skb, xdp_prog); + break; + } + return XDP_DROP; + } + } + return XDP_PASS; +out_redir: + trace_xdp_exception(skb->dev, xdp_prog, XDP_REDIRECT); + kfree_skb(skb); + return XDP_DROP; +} + static int netif_rx_internal(struct sk_buff *skb) { int ret; @@ -3872,6 +3980,18 @@ static int netif_rx_internal(struct sk_buff *skb) net_timestamp_check(netdev_tstamp_prequeue, skb); trace_netif_rx(skb); + + if (static_key_false(&generic_xdp_needed)) { + int ret = do_xdp_generic(skb); + + /* Consider XDP consuming the packet a success from + * the netdev point of view we do not want to count + * this as an error. + */ + if (ret != XDP_PASS) + return NET_RX_SUCCESS; + } + #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; @@ -4338,8 +4458,6 @@ static int __netif_receive_skb(struct sk_buff *skb) return ret; } -static struct static_key generic_xdp_needed __read_mostly; - static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); @@ -4373,89 +4491,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) return ret; } -static u32 netif_receive_generic_xdp(struct sk_buff *skb, - struct bpf_prog *xdp_prog) -{ - struct xdp_buff xdp; - u32 act = XDP_DROP; - void *orig_data; - int hlen, off; - u32 mac_len; - - /* Reinjected packets coming from act_mirred or similar should - * not get XDP generic processing. - */ - if (skb_cloned(skb)) - return XDP_PASS; - - if (skb_linearize(skb)) - goto do_drop; - - /* The XDP program wants to see the packet starting at the MAC - * header. - */ - mac_len = skb->data - skb_mac_header(skb); - hlen = skb_headlen(skb) + mac_len; - xdp.data = skb->data - mac_len; - xdp.data_end = xdp.data + hlen; - xdp.data_hard_start = skb->data - skb_headroom(skb); - orig_data = xdp.data; - - act = bpf_prog_run_xdp(xdp_prog, &xdp); - - off = xdp.data - orig_data; - if (off > 0) - __skb_pull(skb, off); - else if (off < 0) - __skb_push(skb, -off); - - switch (act) { - case XDP_TX: - __skb_push(skb, mac_len); - /* fall through */ - case XDP_PASS: - break; - - default: - bpf_warn_invalid_xdp_action(act); - /* fall through */ - case XDP_ABORTED: - trace_xdp_exception(skb->dev, xdp_prog, act); - /* fall through */ - case XDP_DROP: - do_drop: - kfree_skb(skb); - break; - } - - return act; -} - -/* When doing generic XDP we have to bypass the qdisc layer and the - * network taps in order to match in-driver-XDP behavior. - */ -static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) -{ - struct net_device *dev = skb->dev; - struct netdev_queue *txq; - bool free_skb = true; - int cpu, rc; - - txq = netdev_pick_tx(dev, skb, NULL); - cpu = smp_processor_id(); - HARD_TX_LOCK(dev, txq, cpu); - if (!netif_xmit_stopped(txq)) { - rc = netdev_start_xmit(skb, dev, txq, 0); - if (dev_xmit_complete(rc)) - free_skb = false; - } - HARD_TX_UNLOCK(dev, txq); - if (free_skb) { - trace_xdp_exception(dev, xdp_prog, XDP_TX); - kfree_skb(skb); - } -} - static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; @@ -4468,17 +4503,11 @@ static int netif_receive_skb_internal(struct sk_buff *skb) rcu_read_lock(); if (static_key_false(&generic_xdp_needed)) { - struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); + int ret = do_xdp_generic(skb); - if (xdp_prog) { - u32 act = netif_receive_generic_xdp(skb, xdp_prog); - - if (act != XDP_PASS) { - rcu_read_unlock(); - if (act == XDP_TX) - generic_xdp_tx(skb, xdp_prog); - return NET_RX_DROP; - } + if (ret != XDP_PASS) { + rcu_read_unlock(); + return NET_RX_DROP; } } @@ -6689,8 +6718,12 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) */ ret = 0; - if ((old_flags ^ flags) & IFF_UP) - ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); + if ((old_flags ^ flags) & IFF_UP) { + if (old_flags & IFF_UP) + __dev_close(dev); + else + ret = __dev_open(dev); + } if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; @@ -7235,24 +7268,6 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_GSO; } - /* UFO needs SG and checksumming */ - if (features & NETIF_F_UFO) { - /* maybe split UFO into V4 and V6? */ - if (!(features & NETIF_F_HW_CSUM) && - ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) != - (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) { - netdev_dbg(dev, - "Dropping NETIF_F_UFO since no checksum offload features.\n"); - features &= ~NETIF_F_UFO; - } - - if (!(features & NETIF_F_SG)) { - netdev_dbg(dev, - "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); - features &= ~NETIF_F_UFO; - } - } - /* GSO partial features require GSO partial be set */ if ((features & dev->gso_partial_features) && !(features & NETIF_F_GSO_PARTIAL)) { @@ -7313,8 +7328,27 @@ sync_lower: netdev_for_each_lower_dev(dev, lower, iter) netdev_sync_lower_features(dev, lower, features); - if (!err) + if (!err) { + netdev_features_t diff = features ^ dev->features; + + if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { + /* udp_tunnel_{get,drop}_rx_info both need + * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the + * device, or they won't do anything. + * Thus we need to update dev->features + * *before* calling udp_tunnel_get_rx_info, + * but *after* calling udp_tunnel_drop_rx_info. + */ + if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { + dev->features = features; + udp_tunnel_get_rx_info(dev); + } else { + udp_tunnel_drop_rx_info(dev); + } + } + dev->features = features; + } return err < 0 ? 0 : 1; } @@ -7516,6 +7550,12 @@ int register_netdevice(struct net_device *dev) */ dev->hw_features |= NETIF_F_SOFT_FEATURES; dev->features |= NETIF_F_SOFT_FEATURES; + + if (dev->netdev_ops->ndo_udp_tunnel_add) { + dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; + dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; + } + dev->wanted_features = dev->features & dev->hw_features; if (!(dev->flags & IFF_LOOPBACK)) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 674b6c9cec18..6a582ae4c5d9 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -76,7 +76,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_LRO_BIT] = "rx-lro", [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", - [NETIF_F_UFO_BIT] = "tx-udp-fragmentation", [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation", @@ -106,6 +105,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_HW_TC_BIT] = "hw-tc-offload", [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", + [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", }; static const char @@ -299,9 +299,6 @@ static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) case ETHTOOL_GTSO: case ETHTOOL_STSO: return NETIF_F_ALL_TSO; - case ETHTOOL_GUFO: - case ETHTOOL_SUFO: - return NETIF_F_UFO; case ETHTOOL_GGSO: case ETHTOOL_SGSO: return NETIF_F_GSO; @@ -2515,6 +2512,33 @@ static int set_phy_tunable(struct net_device *dev, void __user *useraddr) return ret; } +static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_fecparam fecparam = { ETHTOOL_GFECPARAM }; + + if (!dev->ethtool_ops->get_fecparam) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_fecparam(dev, &fecparam); + + if (copy_to_user(useraddr, &fecparam, sizeof(fecparam))) + return -EFAULT; + return 0; +} + +static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_fecparam fecparam; + + if (!dev->ethtool_ops->set_fecparam) + return -EOPNOTSUPP; + + if (copy_from_user(&fecparam, useraddr, sizeof(fecparam))) + return -EFAULT; + + return dev->ethtool_ops->set_fecparam(dev, &fecparam); +} + /* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) @@ -2555,7 +2579,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GPHYSTATS: case ETHTOOL_GTSO: case ETHTOOL_GPERMADDR: - case ETHTOOL_GUFO: case ETHTOOL_GGSO: case ETHTOOL_GGRO: case ETHTOOL_GFLAGS: @@ -2574,6 +2597,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GTUNABLE: case ETHTOOL_PHY_GTUNABLE: case ETHTOOL_GLINKSETTINGS: + case ETHTOOL_GFECPARAM: break; default: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -2723,7 +2747,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GRXCSUM: case ETHTOOL_GSG: case ETHTOOL_GTSO: - case ETHTOOL_GUFO: case ETHTOOL_GGSO: case ETHTOOL_GGRO: rc = ethtool_get_one_feature(dev, useraddr, ethcmd); @@ -2732,7 +2755,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SRXCSUM: case ETHTOOL_SSG: case ETHTOOL_STSO: - case ETHTOOL_SUFO: case ETHTOOL_SGSO: case ETHTOOL_SGRO: rc = ethtool_set_one_feature(dev, useraddr, ethcmd); @@ -2785,6 +2807,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_PHY_STUNABLE: rc = set_phy_tunable(dev, useraddr); break; + case ETHTOOL_GFECPARAM: + rc = ethtool_get_fecparam(dev, useraddr); + break; + case ETHTOOL_SFECPARAM: + rc = ethtool_set_fecparam(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } diff --git a/net/core/filter.c b/net/core/filter.c index f44fc22fd45a..7e9708653c6f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -55,6 +55,7 @@ #include <net/sock_reuseport.h> #include <net/busy_poll.h> #include <net/tcp.h> +#include <linux/bpf_trace.h> /** * sk_filter_trim_cap - run a packet through a socket filter @@ -1778,6 +1779,8 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { struct redirect_info { u32 ifindex; u32 flags; + struct bpf_map *map; + struct bpf_map *map_to_flush; }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); @@ -1791,6 +1794,7 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; + ri->map = NULL; return TC_ACT_REDIRECT; } @@ -1818,6 +1822,29 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + + if (unlikely(flags)) + return XDP_ABORTED; + + ri->ifindex = ifindex; + ri->flags = flags; + ri->map = map; + + return XDP_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_map_proto = { + .func = bpf_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -2024,8 +2051,8 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) return ret; if (skb_is_gso(skb)) { - /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to - * be changed into SKB_GSO_TCPV6. + /* SKB_GSO_TCPV4 needs to be changed into + * SKB_GSO_TCPV6. */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4; @@ -2060,8 +2087,8 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) return ret; if (skb_is_gso(skb)) { - /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to - * be changed into SKB_GSO_TCPV4. + /* SKB_GSO_TCPV6 needs to be changed into + * SKB_GSO_TCPV4. */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) { skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6; @@ -2412,6 +2439,140 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +static int __bpf_tx_xdp(struct net_device *dev, + struct bpf_map *map, + struct xdp_buff *xdp, + u32 index) +{ + int err; + + if (!dev->netdev_ops->ndo_xdp_xmit) { + bpf_warn_invalid_xdp_redirect(dev->ifindex); + return -EOPNOTSUPP; + } + + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + if (err) + return err; + + if (map) + __dev_map_insert_ctx(map, index); + else + dev->netdev_ops->ndo_xdp_flush(dev); + + return err; +} + +void xdp_do_flush_map(void) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_map *map = ri->map_to_flush; + + ri->map = NULL; + ri->map_to_flush = NULL; + + if (map) + __dev_map_flush(map); +} +EXPORT_SYMBOL_GPL(xdp_do_flush_map); + +int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_map *map = ri->map; + u32 index = ri->ifindex; + struct net_device *fwd; + int err = -EINVAL; + + ri->ifindex = 0; + ri->map = NULL; + + fwd = __dev_map_lookup_elem(map, index); + if (!fwd) + goto out; + + if (ri->map_to_flush && (ri->map_to_flush != map)) + xdp_do_flush_map(); + + err = __bpf_tx_xdp(fwd, map, xdp, index); + if (likely(!err)) + ri->map_to_flush = map; + +out: + trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT); + return err; +} + +int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct net_device *fwd; + + if (ri->map) + return xdp_do_redirect_map(dev, xdp, xdp_prog); + + fwd = dev_get_by_index_rcu(dev_net(dev), ri->ifindex); + ri->ifindex = 0; + ri->map = NULL; + if (unlikely(!fwd)) { + bpf_warn_invalid_xdp_redirect(ri->ifindex); + return -EINVAL; + } + + trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT); + + return __bpf_tx_xdp(fwd, NULL, xdp, 0); +} +EXPORT_SYMBOL_GPL(xdp_do_redirect); + +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + unsigned int len; + + dev = dev_get_by_index_rcu(dev_net(dev), ri->ifindex); + ri->ifindex = 0; + if (unlikely(!dev)) { + bpf_warn_invalid_xdp_redirect(ri->ifindex); + goto err; + } + + if (unlikely(!(dev->flags & IFF_UP))) + goto err; + + len = dev->mtu + dev->hard_header_len + VLAN_HLEN; + if (skb->len > len) + goto err; + + skb->dev = dev; + return 0; +err: + return -EINVAL; +} +EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); + +BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + + if (unlikely(flags)) + return XDP_ABORTED; + + ri->ifindex = ifindex; + ri->flags = flags; + return XDP_REDIRECT; +} + +static const struct bpf_func_proto bpf_xdp_redirect_proto = { + .func = bpf_xdp_redirect, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || @@ -3011,6 +3172,10 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; + case BPF_FUNC_redirect: + return &bpf_xdp_redirect_proto; + case BPF_FUNC_redirect_map: + return &bpf_redirect_map_proto; default: return bpf_base_func_proto(func_id); } @@ -3310,6 +3475,11 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +void bpf_warn_invalid_xdp_redirect(u32 ifindex) +{ + WARN_ONCE(1, "Illegal XDP redirect to unsupported device ifindex(%i)\n", ifindex); +} + static bool __is_valid_sock_ops_access(int off, int size) { if (off < 0 || off >= sizeof(struct bpf_sock_ops)) diff --git a/net/core/flow.c b/net/core/flow.c deleted file mode 100644 index f7f5d1932a27..000000000000 --- a/net/core/flow.c +++ /dev/null @@ -1,516 +0,0 @@ -/* flow.c: Generic flow cache. - * - * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru) - * Copyright (C) 2003 David S. Miller (davem@redhat.com) - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/list.h> -#include <linux/jhash.h> -#include <linux/interrupt.h> -#include <linux/mm.h> -#include <linux/random.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/smp.h> -#include <linux/completion.h> -#include <linux/percpu.h> -#include <linux/bitops.h> -#include <linux/notifier.h> -#include <linux/cpu.h> -#include <linux/cpumask.h> -#include <linux/mutex.h> -#include <net/flow.h> -#include <linux/atomic.h> -#include <linux/security.h> -#include <net/net_namespace.h> - -struct flow_cache_entry { - union { - struct hlist_node hlist; - struct list_head gc_list; - } u; - struct net *net; - u16 family; - u8 dir; - u32 genid; - struct flowi key; - struct flow_cache_object *object; -}; - -struct flow_flush_info { - struct flow_cache *cache; - atomic_t cpuleft; - struct completion completion; -}; - -static struct kmem_cache *flow_cachep __read_mostly; - -#define flow_cache_hash_size(cache) (1U << (cache)->hash_shift) -#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) - -static void flow_cache_new_hashrnd(unsigned long arg) -{ - struct flow_cache *fc = (void *) arg; - int i; - - for_each_possible_cpu(i) - per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1; - - fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&fc->rnd_timer); -} - -static int flow_entry_valid(struct flow_cache_entry *fle, - struct netns_xfrm *xfrm) -{ - if (atomic_read(&xfrm->flow_cache_genid) != fle->genid) - return 0; - if (fle->object && !fle->object->ops->check(fle->object)) - return 0; - return 1; -} - -static void flow_entry_kill(struct flow_cache_entry *fle, - struct netns_xfrm *xfrm) -{ - if (fle->object) - fle->object->ops->delete(fle->object); - kmem_cache_free(flow_cachep, fle); -} - -static void flow_cache_gc_task(struct work_struct *work) -{ - struct list_head gc_list; - struct flow_cache_entry *fce, *n; - struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm, - flow_cache_gc_work); - - INIT_LIST_HEAD(&gc_list); - spin_lock_bh(&xfrm->flow_cache_gc_lock); - list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list); - spin_unlock_bh(&xfrm->flow_cache_gc_lock); - - list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) { - flow_entry_kill(fce, xfrm); - atomic_dec(&xfrm->flow_cache_gc_count); - } -} - -static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, - unsigned int deleted, - struct list_head *gc_list, - struct netns_xfrm *xfrm) -{ - if (deleted) { - atomic_add(deleted, &xfrm->flow_cache_gc_count); - fcp->hash_count -= deleted; - spin_lock_bh(&xfrm->flow_cache_gc_lock); - list_splice_tail(gc_list, &xfrm->flow_cache_gc_list); - spin_unlock_bh(&xfrm->flow_cache_gc_lock); - schedule_work(&xfrm->flow_cache_gc_work); - } -} - -static void __flow_cache_shrink(struct flow_cache *fc, - struct flow_cache_percpu *fcp, - unsigned int shrink_to) -{ - struct flow_cache_entry *fle; - struct hlist_node *tmp; - LIST_HEAD(gc_list); - unsigned int deleted = 0; - struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm, - flow_cache_global); - unsigned int i; - - for (i = 0; i < flow_cache_hash_size(fc); i++) { - unsigned int saved = 0; - - hlist_for_each_entry_safe(fle, tmp, - &fcp->hash_table[i], u.hlist) { - if (saved < shrink_to && - flow_entry_valid(fle, xfrm)) { - saved++; - } else { - deleted++; - hlist_del(&fle->u.hlist); - list_add_tail(&fle->u.gc_list, &gc_list); - } - } - } - - flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm); -} - -static void flow_cache_shrink(struct flow_cache *fc, - struct flow_cache_percpu *fcp) -{ - unsigned int shrink_to = fc->low_watermark / flow_cache_hash_size(fc); - - __flow_cache_shrink(fc, fcp, shrink_to); -} - -static void flow_new_hash_rnd(struct flow_cache *fc, - struct flow_cache_percpu *fcp) -{ - get_random_bytes(&fcp->hash_rnd, sizeof(u32)); - fcp->hash_rnd_recalc = 0; - __flow_cache_shrink(fc, fcp, 0); -} - -static u32 flow_hash_code(struct flow_cache *fc, - struct flow_cache_percpu *fcp, - const struct flowi *key, - unsigned int keysize) -{ - const u32 *k = (const u32 *) key; - const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32); - - return jhash2(k, length, fcp->hash_rnd) - & (flow_cache_hash_size(fc) - 1); -} - -/* I hear what you're saying, use memcmp. But memcmp cannot make - * important assumptions that we can here, such as alignment. - */ -static int flow_key_compare(const struct flowi *key1, const struct flowi *key2, - unsigned int keysize) -{ - const flow_compare_t *k1, *k1_lim, *k2; - - k1 = (const flow_compare_t *) key1; - k1_lim = k1 + keysize; - - k2 = (const flow_compare_t *) key2; - - do { - if (*k1++ != *k2++) - return 1; - } while (k1 < k1_lim); - - return 0; -} - -struct flow_cache_object * -flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, - flow_resolve_t resolver, void *ctx) -{ - struct flow_cache *fc = &net->xfrm.flow_cache_global; - struct flow_cache_percpu *fcp; - struct flow_cache_entry *fle, *tfle; - struct flow_cache_object *flo; - unsigned int keysize; - unsigned int hash; - - local_bh_disable(); - fcp = this_cpu_ptr(fc->percpu); - - fle = NULL; - flo = NULL; - - keysize = flow_key_size(family); - if (!keysize) - goto nocache; - - /* Packet really early in init? Making flow_cache_init a - * pre-smp initcall would solve this. --RR */ - if (!fcp->hash_table) - goto nocache; - - if (fcp->hash_rnd_recalc) - flow_new_hash_rnd(fc, fcp); - - hash = flow_hash_code(fc, fcp, key, keysize); - hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) { - if (tfle->net == net && - tfle->family == family && - tfle->dir == dir && - flow_key_compare(key, &tfle->key, keysize) == 0) { - fle = tfle; - break; - } - } - - if (unlikely(!fle)) { - if (fcp->hash_count > fc->high_watermark) - flow_cache_shrink(fc, fcp); - - if (atomic_read(&net->xfrm.flow_cache_gc_count) > - 2 * num_online_cpus() * fc->high_watermark) { - flo = ERR_PTR(-ENOBUFS); - goto ret_object; - } - - fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); - if (fle) { - fle->net = net; - fle->family = family; - fle->dir = dir; - memcpy(&fle->key, key, keysize * sizeof(flow_compare_t)); - fle->object = NULL; - hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]); - fcp->hash_count++; - } - } else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) { - flo = fle->object; - if (!flo) - goto ret_object; - flo = flo->ops->get(flo); - if (flo) - goto ret_object; - } else if (fle->object) { - flo = fle->object; - flo->ops->delete(flo); - fle->object = NULL; - } - -nocache: - flo = NULL; - if (fle) { - flo = fle->object; - fle->object = NULL; - } - flo = resolver(net, key, family, dir, flo, ctx); - if (fle) { - fle->genid = atomic_read(&net->xfrm.flow_cache_genid); - if (!IS_ERR(flo)) - fle->object = flo; - else - fle->genid--; - } else { - if (!IS_ERR_OR_NULL(flo)) - flo->ops->delete(flo); - } -ret_object: - local_bh_enable(); - return flo; -} -EXPORT_SYMBOL(flow_cache_lookup); - -static void flow_cache_flush_tasklet(unsigned long data) -{ - struct flow_flush_info *info = (void *)data; - struct flow_cache *fc = info->cache; - struct flow_cache_percpu *fcp; - struct flow_cache_entry *fle; - struct hlist_node *tmp; - LIST_HEAD(gc_list); - unsigned int deleted = 0; - struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm, - flow_cache_global); - unsigned int i; - - fcp = this_cpu_ptr(fc->percpu); - for (i = 0; i < flow_cache_hash_size(fc); i++) { - hlist_for_each_entry_safe(fle, tmp, - &fcp->hash_table[i], u.hlist) { - if (flow_entry_valid(fle, xfrm)) - continue; - - deleted++; - hlist_del(&fle->u.hlist); - list_add_tail(&fle->u.gc_list, &gc_list); - } - } - - flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm); - - if (atomic_dec_and_test(&info->cpuleft)) - complete(&info->completion); -} - -/* - * Return whether a cpu needs flushing. Conservatively, we assume - * the presence of any entries means the core may require flushing, - * since the flow_cache_ops.check() function may assume it's running - * on the same core as the per-cpu cache component. - */ -static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu) -{ - struct flow_cache_percpu *fcp; - unsigned int i; - - fcp = per_cpu_ptr(fc->percpu, cpu); - for (i = 0; i < flow_cache_hash_size(fc); i++) - if (!hlist_empty(&fcp->hash_table[i])) - return 0; - return 1; -} - -static void flow_cache_flush_per_cpu(void *data) -{ - struct flow_flush_info *info = data; - struct tasklet_struct *tasklet; - - tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet; - tasklet->data = (unsigned long)info; - tasklet_schedule(tasklet); -} - -void flow_cache_flush(struct net *net) -{ - struct flow_flush_info info; - cpumask_var_t mask; - int i, self; - - /* Track which cpus need flushing to avoid disturbing all cores. */ - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return; - cpumask_clear(mask); - - /* Don't want cpus going down or up during this. */ - get_online_cpus(); - mutex_lock(&net->xfrm.flow_flush_sem); - info.cache = &net->xfrm.flow_cache_global; - for_each_online_cpu(i) - if (!flow_cache_percpu_empty(info.cache, i)) - cpumask_set_cpu(i, mask); - atomic_set(&info.cpuleft, cpumask_weight(mask)); - if (atomic_read(&info.cpuleft) == 0) - goto done; - - init_completion(&info.completion); - - local_bh_disable(); - self = cpumask_test_and_clear_cpu(smp_processor_id(), mask); - on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0); - if (self) - flow_cache_flush_tasklet((unsigned long)&info); - local_bh_enable(); - - wait_for_completion(&info.completion); - -done: - mutex_unlock(&net->xfrm.flow_flush_sem); - put_online_cpus(); - free_cpumask_var(mask); -} - -static void flow_cache_flush_task(struct work_struct *work) -{ - struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm, - flow_cache_flush_work); - struct net *net = container_of(xfrm, struct net, xfrm); - - flow_cache_flush(net); -} - -void flow_cache_flush_deferred(struct net *net) -{ - schedule_work(&net->xfrm.flow_cache_flush_work); -} - -static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) -{ - struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); - unsigned int sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc); - - if (!fcp->hash_table) { - fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); - if (!fcp->hash_table) { - pr_err("NET: failed to allocate flow cache sz %u\n", sz); - return -ENOMEM; - } - fcp->hash_rnd_recalc = 1; - fcp->hash_count = 0; - tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0); - } - return 0; -} - -static int flow_cache_cpu_up_prep(unsigned int cpu, struct hlist_node *node) -{ - struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node); - - return flow_cache_cpu_prepare(fc, cpu); -} - -static int flow_cache_cpu_dead(unsigned int cpu, struct hlist_node *node) -{ - struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node); - struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); - - __flow_cache_shrink(fc, fcp, 0); - return 0; -} - -int flow_cache_init(struct net *net) -{ - int i; - struct flow_cache *fc = &net->xfrm.flow_cache_global; - - if (!flow_cachep) - flow_cachep = kmem_cache_create("flow_cache", - sizeof(struct flow_cache_entry), - 0, SLAB_PANIC, NULL); - spin_lock_init(&net->xfrm.flow_cache_gc_lock); - INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list); - INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task); - INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task); - mutex_init(&net->xfrm.flow_flush_sem); - atomic_set(&net->xfrm.flow_cache_gc_count, 0); - - fc->hash_shift = 10; - fc->low_watermark = 2 * flow_cache_hash_size(fc); - fc->high_watermark = 4 * flow_cache_hash_size(fc); - - fc->percpu = alloc_percpu(struct flow_cache_percpu); - if (!fc->percpu) - return -ENOMEM; - - if (cpuhp_state_add_instance(CPUHP_NET_FLOW_PREPARE, &fc->node)) - goto err; - - setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, - (unsigned long) fc); - fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&fc->rnd_timer); - - return 0; - -err: - for_each_possible_cpu(i) { - struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i); - kfree(fcp->hash_table); - fcp->hash_table = NULL; - } - - free_percpu(fc->percpu); - fc->percpu = NULL; - - return -ENOMEM; -} -EXPORT_SYMBOL(flow_cache_init); - -void flow_cache_fini(struct net *net) -{ - int i; - struct flow_cache *fc = &net->xfrm.flow_cache_global; - - del_timer_sync(&fc->rnd_timer); - - cpuhp_state_remove_instance_nocalls(CPUHP_NET_FLOW_PREPARE, &fc->node); - - for_each_possible_cpu(i) { - struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i); - kfree(fcp->hash_table); - fcp->hash_table = NULL; - } - - free_percpu(fc->percpu); - fc->percpu = NULL; -} -EXPORT_SYMBOL(flow_cache_fini); - -void __init flow_cache_hp_init(void) -{ - int ret; - - ret = cpuhp_setup_state_multi(CPUHP_NET_FLOW_PREPARE, - "net/flow:prepare", - flow_cache_cpu_up_prep, - flow_cache_cpu_dead); - WARN_ON(ret < 0); -} diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f990eb8b30a9..c27da51d14e4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -158,31 +158,6 @@ out: * */ -struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) -{ - struct sk_buff *skb; - - /* Get the HEAD */ - skb = kmem_cache_alloc_node(skbuff_head_cache, - gfp_mask & ~__GFP_DMA, node); - if (!skb) - goto out; - - /* - * Only clear those fields we need to clear, not those that we will - * actually initialise below. Hence, don't put any more fields after - * the tail pointer in struct sk_buff! - */ - memset(skb, 0, offsetof(struct sk_buff, tail)); - skb->head = NULL; - skb->truesize = sizeof(struct sk_buff); - refcount_set(&skb->users, 1); - - skb->mac_header = (typeof(skb->mac_header))~0U; -out: - return skb; -} - /** * __alloc_skb - allocate a network buffer * @size: size to allocate @@ -762,8 +737,7 @@ void consume_stateless_skb(struct sk_buff *skb) return; trace_consume_skb(skb); - if (likely(skb->head)) - skb_release_data(skb); + skb_release_data(skb); kfree_skbmem(skb); } @@ -1719,6 +1693,8 @@ pull_pages: if (eat) { skb_shinfo(skb)->frags[k].page_offset += eat; skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); + if (!i) + goto end; eat = 0; } k++; @@ -1726,6 +1702,7 @@ pull_pages: } skb_shinfo(skb)->nr_frags = k; +end: skb->tail += delta; skb->data_len -= delta; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 416ac4ef9ba9..a55e2e4087a4 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -220,6 +220,11 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, } #ifdef CONFIG_PM_SLEEP +static bool dsa_is_port_initialized(struct dsa_switch *ds, int p) +{ + return ds->enabled_port_mask & (1 << p) && ds->ports[p].netdev; +} + int dsa_switch_suspend(struct dsa_switch *ds) { int i, ret = 0; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 76c2077c3f5b..5ce44fb7d498 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1219,10 +1219,9 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { - bool udpfrag = false, fixedid = false, gso_partial, encap; + bool fixedid = false, gso_partial, encap; struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; - unsigned int offset = 0; struct iphdr *iph; int proto, tot_len; int nhoff; @@ -1257,7 +1256,6 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); if (!skb->encapsulation || encap) { - udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); /* fixed ID is invalid if DF bit is not set */ @@ -1277,13 +1275,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, skb = segs; do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); - if (udpfrag) { - iph->frag_off = htons(offset >> 3); - if (skb->next) - iph->frag_off |= htons(IP_MF); - offset += skb->len - nhoff - ihl; - tot_len = skb->len - nhoff; - } else if (skb_is_gso(skb)) { + if (skb_is_gso(skb)) { if (!fixedid) { iph->id = htons(id); id += skb_shinfo(skb)->gso_segs; diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index d5cac99170b1..416bb304a281 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, __be16 protocol = skb->protocol; u16 mac_len = skb->mac_len; int gre_offset, outer_hlen; - bool need_csum, ufo, gso_partial; + bool need_csum, gso_partial; if (!skb->encapsulation) goto out; @@ -47,20 +47,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); skb->encap_hdr_csum = need_csum; - ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); - features &= skb->dev->hw_enc_features; - /* The only checksum offload we care about from here on out is the - * outer one so strip the existing checksum feature flags based - * on the fact that we will be computing our checksum in software. - */ - if (ufo) { - features &= ~NETIF_F_CSUM_MASK; - if (!need_csum) - features |= NETIF_F_HW_CSUM; - } - /* segment inner packet. */ segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index c5a117cc6619..337ad41bb80a 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -33,7 +33,7 @@ * also be removed if the pool is overloaded i.e. if the total amount of * entries is greater-or-equal than the threshold. * - * Node pool is organised as an AVL tree. + * Node pool is organised as an RB tree. * Such an implementation has been chosen not just for fun. It's a way to * prevent easy and efficient DoS attacks by creating hash collisions. A huge * amount of long living nodes in a single hash slot would significantly delay @@ -45,7 +45,7 @@ * AND reference count being 0. * 3. Global variable peer_total is modified under the pool lock. * 4. struct inet_peer fields modification: - * avl_left, avl_right, avl_parent, avl_height: pool lock + * rb_node: pool lock * refcnt: atomically against modifications on other CPU; * usually under some other lock to prevent node disappearing * daddr: unchangeable @@ -53,30 +53,15 @@ static struct kmem_cache *peer_cachep __read_mostly; -static LIST_HEAD(gc_list); -static const int gc_delay = 60 * HZ; -static struct delayed_work gc_work; -static DEFINE_SPINLOCK(gc_lock); - -#define node_height(x) x->avl_height - -#define peer_avl_empty ((struct inet_peer *)&peer_fake_node) -#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node) -static const struct inet_peer peer_fake_node = { - .avl_left = peer_avl_empty_rcu, - .avl_right = peer_avl_empty_rcu, - .avl_height = 0 -}; - void inet_peer_base_init(struct inet_peer_base *bp) { - bp->root = peer_avl_empty_rcu; + bp->rb_root = RB_ROOT; seqlock_init(&bp->lock); bp->total = 0; } EXPORT_SYMBOL_GPL(inet_peer_base_init); -#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ +#define PEER_MAX_GC 32 /* Exported for sysctl_net_ipv4. */ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more @@ -84,53 +69,6 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ -static void inetpeer_gc_worker(struct work_struct *work) -{ - struct inet_peer *p, *n, *c; - struct list_head list; - - spin_lock_bh(&gc_lock); - list_replace_init(&gc_list, &list); - spin_unlock_bh(&gc_lock); - - if (list_empty(&list)) - return; - - list_for_each_entry_safe(p, n, &list, gc_list) { - - if (need_resched()) - cond_resched(); - - c = rcu_dereference_protected(p->avl_left, 1); - if (c != peer_avl_empty) { - list_add_tail(&c->gc_list, &list); - p->avl_left = peer_avl_empty_rcu; - } - - c = rcu_dereference_protected(p->avl_right, 1); - if (c != peer_avl_empty) { - list_add_tail(&c->gc_list, &list); - p->avl_right = peer_avl_empty_rcu; - } - - n = list_entry(p->gc_list.next, struct inet_peer, gc_list); - - if (refcount_read(&p->refcnt) == 1) { - list_del(&p->gc_list); - kmem_cache_free(peer_cachep, p); - } - } - - if (list_empty(&list)) - return; - - spin_lock_bh(&gc_lock); - list_splice(&list, &gc_list); - spin_unlock_bh(&gc_lock); - - schedule_delayed_work(&gc_work, gc_delay); -} - /* Called from ip_output.c:ip_init */ void __init inet_initpeers(void) { @@ -153,225 +91,62 @@ void __init inet_initpeers(void) sizeof(struct inet_peer), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); - - INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker); } -#define rcu_deref_locked(X, BASE) \ - rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock)) - -/* - * Called with local BH disabled and the pool lock held. - */ -#define lookup(_daddr, _stack, _base) \ -({ \ - struct inet_peer *u; \ - struct inet_peer __rcu **v; \ - \ - stackptr = _stack; \ - *stackptr++ = &_base->root; \ - for (u = rcu_deref_locked(_base->root, _base); \ - u != peer_avl_empty;) { \ - int cmp = inetpeer_addr_cmp(_daddr, &u->daddr); \ - if (cmp == 0) \ - break; \ - if (cmp == -1) \ - v = &u->avl_left; \ - else \ - v = &u->avl_right; \ - *stackptr++ = v; \ - u = rcu_deref_locked(*v, _base); \ - } \ - u; \ -}) - -/* - * Called with rcu_read_lock() - * Because we hold no lock against a writer, its quite possible we fall - * in an endless loop. - * But every pointer we follow is guaranteed to be valid thanks to RCU. - * We exit from this function if number of links exceeds PEER_MAXDEPTH - */ -static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, - struct inet_peer_base *base) +/* Called with rcu_read_lock() or base->lock held */ +static struct inet_peer *lookup(const struct inetpeer_addr *daddr, + struct inet_peer_base *base, + unsigned int seq, + struct inet_peer *gc_stack[], + unsigned int *gc_cnt, + struct rb_node **parent_p, + struct rb_node ***pp_p) { - struct inet_peer *u = rcu_dereference(base->root); - int count = 0; + struct rb_node **pp, *parent; + struct inet_peer *p; + + pp = &base->rb_root.rb_node; + parent = NULL; + while (*pp) { + int cmp; - while (u != peer_avl_empty) { - int cmp = inetpeer_addr_cmp(daddr, &u->daddr); + parent = rcu_dereference_raw(*pp); + p = rb_entry(parent, struct inet_peer, rb_node); + cmp = inetpeer_addr_cmp(daddr, &p->daddr); if (cmp == 0) { - /* Before taking a reference, check if this entry was - * deleted (refcnt=0) - */ - if (!refcount_inc_not_zero(&u->refcnt)) { - u = NULL; - } - return u; + if (!refcount_inc_not_zero(&p->refcnt)) + break; + return p; + } + if (gc_stack) { + if (*gc_cnt < PEER_MAX_GC) + gc_stack[(*gc_cnt)++] = p; + } else if (unlikely(read_seqretry(&base->lock, seq))) { + break; } if (cmp == -1) - u = rcu_dereference(u->avl_left); + pp = &(*pp)->rb_left; else - u = rcu_dereference(u->avl_right); - if (unlikely(++count == PEER_MAXDEPTH)) - break; + pp = &(*pp)->rb_right; } + *parent_p = parent; + *pp_p = pp; return NULL; } -/* Called with local BH disabled and the pool lock held. */ -#define lookup_rightempty(start, base) \ -({ \ - struct inet_peer *u; \ - struct inet_peer __rcu **v; \ - *stackptr++ = &start->avl_left; \ - v = &start->avl_left; \ - for (u = rcu_deref_locked(*v, base); \ - u->avl_right != peer_avl_empty_rcu;) { \ - v = &u->avl_right; \ - *stackptr++ = v; \ - u = rcu_deref_locked(*v, base); \ - } \ - u; \ -}) - -/* Called with local BH disabled and the pool lock held. - * Variable names are the proof of operation correctness. - * Look into mm/map_avl.c for more detail description of the ideas. - */ -static void peer_avl_rebalance(struct inet_peer __rcu **stack[], - struct inet_peer __rcu ***stackend, - struct inet_peer_base *base) -{ - struct inet_peer __rcu **nodep; - struct inet_peer *node, *l, *r; - int lh, rh; - - while (stackend > stack) { - nodep = *--stackend; - node = rcu_deref_locked(*nodep, base); - l = rcu_deref_locked(node->avl_left, base); - r = rcu_deref_locked(node->avl_right, base); - lh = node_height(l); - rh = node_height(r); - if (lh > rh + 1) { /* l: RH+2 */ - struct inet_peer *ll, *lr, *lrl, *lrr; - int lrh; - ll = rcu_deref_locked(l->avl_left, base); - lr = rcu_deref_locked(l->avl_right, base); - lrh = node_height(lr); - if (lrh <= node_height(ll)) { /* ll: RH+1 */ - RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ - RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ - node->avl_height = lrh + 1; /* RH+1 or RH+2 */ - RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */ - RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */ - l->avl_height = node->avl_height + 1; - RCU_INIT_POINTER(*nodep, l); - } else { /* ll: RH, lr: RH+1 */ - lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */ - lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */ - RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ - RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ - node->avl_height = rh + 1; /* node: RH+1 */ - RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */ - RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */ - l->avl_height = rh + 1; /* l: RH+1 */ - RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */ - RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */ - lr->avl_height = rh + 2; - RCU_INIT_POINTER(*nodep, lr); - } - } else if (rh > lh + 1) { /* r: LH+2 */ - struct inet_peer *rr, *rl, *rlr, *rll; - int rlh; - rr = rcu_deref_locked(r->avl_right, base); - rl = rcu_deref_locked(r->avl_left, base); - rlh = node_height(rl); - if (rlh <= node_height(rr)) { /* rr: LH+1 */ - RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ - RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ - node->avl_height = rlh + 1; /* LH+1 or LH+2 */ - RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */ - RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */ - r->avl_height = node->avl_height + 1; - RCU_INIT_POINTER(*nodep, r); - } else { /* rr: RH, rl: RH+1 */ - rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */ - rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */ - RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ - RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ - node->avl_height = lh + 1; /* node: LH+1 */ - RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */ - RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */ - r->avl_height = lh + 1; /* r: LH+1 */ - RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */ - RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */ - rl->avl_height = lh + 2; - RCU_INIT_POINTER(*nodep, rl); - } - } else { - node->avl_height = (lh > rh ? lh : rh) + 1; - } - } -} - -/* Called with local BH disabled and the pool lock held. */ -#define link_to_pool(n, base) \ -do { \ - n->avl_height = 1; \ - n->avl_left = peer_avl_empty_rcu; \ - n->avl_right = peer_avl_empty_rcu; \ - /* lockless readers can catch us now */ \ - rcu_assign_pointer(**--stackptr, n); \ - peer_avl_rebalance(stack, stackptr, base); \ -} while (0) - static void inetpeer_free_rcu(struct rcu_head *head) { kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu)); } -static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base, - struct inet_peer __rcu **stack[PEER_MAXDEPTH]) -{ - struct inet_peer __rcu ***stackptr, ***delp; - - if (lookup(&p->daddr, stack, base) != p) - BUG(); - delp = stackptr - 1; /* *delp[0] == p */ - if (p->avl_left == peer_avl_empty_rcu) { - *delp[0] = p->avl_right; - --stackptr; - } else { - /* look for a node to insert instead of p */ - struct inet_peer *t; - t = lookup_rightempty(p, base); - BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t); - **--stackptr = t->avl_left; - /* t is removed, t->daddr > x->daddr for any - * x in p->avl_left subtree. - * Put t in the old place of p. */ - RCU_INIT_POINTER(*delp[0], t); - t->avl_left = p->avl_left; - t->avl_right = p->avl_right; - t->avl_height = p->avl_height; - BUG_ON(delp[1] != &p->avl_left); - delp[1] = &t->avl_left; /* was &p->avl_left */ - } - peer_avl_rebalance(stack, stackptr, base); - base->total--; - call_rcu(&p->rcu, inetpeer_free_rcu); -} - /* perform garbage collect on all items stacked during a lookup */ -static int inet_peer_gc(struct inet_peer_base *base, - struct inet_peer __rcu **stack[PEER_MAXDEPTH], - struct inet_peer __rcu ***stackptr) +static void inet_peer_gc(struct inet_peer_base *base, + struct inet_peer *gc_stack[], + unsigned int gc_cnt) { - struct inet_peer *p, *gchead = NULL; + struct inet_peer *p; __u32 delta, ttl; - int cnt = 0; + int i; if (base->total >= inet_peer_threshold) ttl = 0; /* be aggressive */ @@ -379,43 +154,38 @@ static int inet_peer_gc(struct inet_peer_base *base, ttl = inet_peer_maxttl - (inet_peer_maxttl - inet_peer_minttl) / HZ * base->total / inet_peer_threshold * HZ; - stackptr--; /* last stack slot is peer_avl_empty */ - while (stackptr > stack) { - stackptr--; - p = rcu_deref_locked(**stackptr, base); - if (refcount_read(&p->refcnt) == 1) { - smp_rmb(); - delta = (__u32)jiffies - p->dtime; - if (delta >= ttl && refcount_dec_if_one(&p->refcnt)) { - p->gc_next = gchead; - gchead = p; - } - } + for (i = 0; i < gc_cnt; i++) { + p = gc_stack[i]; + delta = (__u32)jiffies - p->dtime; + if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) + gc_stack[i] = NULL; } - while ((p = gchead) != NULL) { - gchead = p->gc_next; - cnt++; - unlink_from_pool(p, base, stack); + for (i = 0; i < gc_cnt; i++) { + p = gc_stack[i]; + if (p) { + rb_erase(&p->rb_node, &base->rb_root); + base->total--; + call_rcu(&p->rcu, inetpeer_free_rcu); + } } - return cnt; } struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr, int create) { - struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; - struct inet_peer *p; - unsigned int sequence; - int invalidated, gccnt = 0; + struct inet_peer *p, *gc_stack[PEER_MAX_GC]; + struct rb_node **pp, *parent; + unsigned int gc_cnt, seq; + int invalidated; /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ rcu_read_lock(); - sequence = read_seqbegin(&base->lock); - p = lookup_rcu(daddr, base); - invalidated = read_seqretry(&base->lock, sequence); + seq = read_seqbegin(&base->lock); + p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); + invalidated = read_seqretry(&base->lock, seq); rcu_read_unlock(); if (p) @@ -428,36 +198,31 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, /* retry an exact lookup, taking the lock before. * At least, nodes should be hot in our cache. */ + parent = NULL; write_seqlock_bh(&base->lock); -relookup: - p = lookup(daddr, stack, base); - if (p != peer_avl_empty) { - refcount_inc(&p->refcnt); - write_sequnlock_bh(&base->lock); - return p; - } - if (!gccnt) { - gccnt = inet_peer_gc(base, stack, stackptr); - if (gccnt && create) - goto relookup; - } - p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; - if (p) { - p->daddr = *daddr; - refcount_set(&p->refcnt, 2); - atomic_set(&p->rid, 0); - p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; - p->rate_tokens = 0; - /* 60*HZ is arbitrary, but chosen enough high so that the first - * calculation of tokens is at its maximum. - */ - p->rate_last = jiffies - 60*HZ; - INIT_LIST_HEAD(&p->gc_list); - /* Link the node. */ - link_to_pool(p, base); - base->total++; + gc_cnt = 0; + p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp); + if (!p && create) { + p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); + if (p) { + p->daddr = *daddr; + refcount_set(&p->refcnt, 2); + atomic_set(&p->rid, 0); + p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; + p->rate_tokens = 0; + /* 60*HZ is arbitrary, but chosen enough high so that the first + * calculation of tokens is at its maximum. + */ + p->rate_last = jiffies - 60*HZ; + + rb_link_node(&p->rb_node, parent, pp); + rb_insert_color(&p->rb_node, &base->rb_root); + base->total++; + } } + if (gc_cnt) + inet_peer_gc(base, gc_stack, gc_cnt); write_sequnlock_bh(&base->lock); return p; @@ -467,8 +232,9 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { p->dtime = (__u32)jiffies; - smp_mb__before_atomic(); - refcount_dec(&p->refcnt); + + if (refcount_dec_and_test(&p->refcnt)) + call_rcu(&p->rcu, inetpeer_free_rcu); } EXPORT_SYMBOL_GPL(inet_putpeer); @@ -513,30 +279,16 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) } EXPORT_SYMBOL(inet_peer_xrlim_allow); -static void inetpeer_inval_rcu(struct rcu_head *head) -{ - struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu); - - spin_lock_bh(&gc_lock); - list_add_tail(&p->gc_list, &gc_list); - spin_unlock_bh(&gc_lock); - - schedule_delayed_work(&gc_work, gc_delay); -} - void inetpeer_invalidate_tree(struct inet_peer_base *base) { - struct inet_peer *root; - - write_seqlock_bh(&base->lock); + struct inet_peer *p, *n; - root = rcu_deref_locked(base->root, base); - if (root != peer_avl_empty) { - base->root = peer_avl_empty_rcu; - base->total = 0; - call_rcu(&root->gc_rcu, inetpeer_inval_rcu); + rbtree_postorder_for_each_entry_safe(p, n, &base->rb_root, rb_node) { + inet_putpeer(p); + cond_resched(); } - write_sequnlock_bh(&base->lock); + base->rb_root = RB_ROOT; + base->total = 0; } EXPORT_SYMBOL(inetpeer_invalidate_tree); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 50c74cd890bc..b631ec685d77 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -853,61 +853,6 @@ csum_page(struct page *page, int offset, int copy) return csum; } -static inline int ip_ufo_append_data(struct sock *sk, - struct sk_buff_head *queue, - int getfrag(void *from, char *to, int offset, int len, - int odd, struct sk_buff *skb), - void *from, int length, int hh_len, int fragheaderlen, - int transhdrlen, int maxfraglen, unsigned int flags) -{ - struct sk_buff *skb; - int err; - - /* There is support for UDP fragmentation offload by network - * device, so create one single skb packet containing complete - * udp datagram - */ - skb = skb_peek_tail(queue); - if (!skb) { - skb = sock_alloc_send_skb(sk, - hh_len + fragheaderlen + transhdrlen + 20, - (flags & MSG_DONTWAIT), &err); - - if (!skb) - return err; - - /* reserve space for Hardware header */ - skb_reserve(skb, hh_len); - - /* create space for UDP/IP header */ - skb_put(skb, fragheaderlen + transhdrlen); - - /* initialize network header pointer */ - skb_reset_network_header(skb); - - /* initialize protocol header pointer */ - skb->transport_header = skb->network_header + fragheaderlen; - - skb->csum = 0; - - if (flags & MSG_CONFIRM) - skb_set_dst_pending_confirm(skb, 1); - - __skb_queue_tail(queue, skb); - } else if (skb_is_gso(skb)) { - goto append; - } - - skb->ip_summed = CHECKSUM_PARTIAL; - /* specify the length of each IP datagram fragment */ - skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; - skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - -append: - return skb_append_datato_frags(sk, skb, getfrag, from, - (length - transhdrlen)); -} - static int __ip_append_data(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, @@ -965,18 +910,6 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; cork->length += length; - if ((((length + (skb ? skb->len : fragheaderlen)) > mtu) || - (skb && skb_is_gso(skb))) && - (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) && - (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { - err = ip_ufo_append_data(sk, queue, getfrag, from, length, - hh_len, fragheaderlen, transhdrlen, - maxfraglen, flags); - if (err) - goto error; - return 0; - } /* So, what's going on in the loop below? * @@ -1287,15 +1220,6 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, if (!skb) return -EINVAL; - if ((size + skb->len > mtu) && - (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO)) { - if (skb->ip_summed != CHECKSUM_PARTIAL) - return -EOPNOTSUPP; - - skb_shinfo(skb)->gso_size = mtu - fragheaderlen; - skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - } cork->length += size; while (size > 0) { diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 0192c255e508..5ed63d250950 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -584,33 +584,6 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = { .get_link_net = ip_tunnel_get_link_net, }; -static bool is_vti_tunnel(const struct net_device *dev) -{ - return dev->netdev_ops == &vti_netdev_ops; -} - -static int vti_device_event(struct notifier_block *unused, - unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct ip_tunnel *tunnel = netdev_priv(dev); - - if (!is_vti_tunnel(dev)) - return NOTIFY_DONE; - - switch (event) { - case NETDEV_DOWN: - if (!net_eq(tunnel->net, dev_net(dev))) - xfrm_garbage_collect(tunnel->net); - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block vti_notifier_block __read_mostly = { - .notifier_call = vti_device_event, -}; - static int __init vti_init(void) { const char *msg; @@ -618,8 +591,6 @@ static int __init vti_init(void) pr_info("IPv4 over IPsec tunneling driver\n"); - register_netdevice_notifier(&vti_notifier_block); - msg = "tunnel device"; err = register_pernet_device(&vti_net_ops); if (err < 0) @@ -652,7 +623,6 @@ xfrm_proto_ah_failed: xfrm_proto_esp_failed: unregister_pernet_device(&vti_net_ops); pernet_dev_failed: - unregister_netdevice_notifier(&vti_notifier_block); pr_err("vti init: failed to register %s\n", msg); return err; } @@ -664,7 +634,6 @@ static void __exit vti_fini(void) xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); unregister_pernet_device(&vti_net_ops); - unregister_netdevice_notifier(&vti_notifier_block); } module_init(vti_init); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 43eb6567b3a0..b6d3fe03feb3 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -206,14 +206,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS), SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS), - SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED), - SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG), - SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE), - SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED), - SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS), - SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER), SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS), - SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS), SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), @@ -230,14 +223,12 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES), SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS), - SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS), SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES), SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY), SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), - SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9bf809726066..0d3c038d7b04 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -45,6 +45,9 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; +/* obsolete */ +static int sysctl_tcp_low_latency __read_mostly; + /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71ce33decd97..5326b50a3450 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -388,6 +388,19 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max) return period; } +static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) +{ + u32 rate = READ_ONCE(tp->rate_delivered); + u32 intv = READ_ONCE(tp->rate_interval_us); + u64 rate64 = 0; + + if (rate && intv) { + rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; + do_div(rate64, intv); + } + return rate64; +} + /* Address-family independent initialization for a tcp_sock. * * NOTE: A lot of things set to zero explicitly by call to @@ -400,7 +413,6 @@ void tcp_init_sock(struct sock *sk) tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); - tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); icsk->icsk_rto = TCP_TIMEOUT_INIT; @@ -1525,20 +1537,6 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied) tcp_send_ack(sk); } -static void tcp_prequeue_process(struct sock *sk) -{ - struct sk_buff *skb; - struct tcp_sock *tp = tcp_sk(sk); - - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED); - - while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb); - - /* Clear memory counter. */ - tp->ucopy.memory = 0; -} - static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1671,7 +1669,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int err; int target; /* Read at least this many bytes */ long timeo; - struct task_struct *user_recv = NULL; struct sk_buff *skb, *last; u32 urg_hole = 0; @@ -1806,51 +1803,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, tcp_cleanup_rbuf(sk, copied); - if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { - /* Install new reader */ - if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { - user_recv = current; - tp->ucopy.task = user_recv; - tp->ucopy.msg = msg; - } - - tp->ucopy.len = len; - - WARN_ON(tp->copied_seq != tp->rcv_nxt && - !(flags & (MSG_PEEK | MSG_TRUNC))); - - /* Ugly... If prequeue is not empty, we have to - * process it before releasing socket, otherwise - * order will be broken at second iteration. - * More elegant solution is required!!! - * - * Look: we have the following (pseudo)queues: - * - * 1. packets in flight - * 2. backlog - * 3. prequeue - * 4. receive_queue - * - * Each queue can be processed only if the next ones - * are empty. At this point we have empty receive_queue. - * But prequeue _can_ be not empty after 2nd iteration, - * when we jumped to start of loop because backlog - * processing added something to receive_queue. - * We cannot release_sock(), because backlog contains - * packets arrived _after_ prequeued ones. - * - * Shortly, algorithm is clear --- to process all - * the queues in order. We could make it more directly, - * requeueing packets from backlog to prequeue, if - * is not empty. It is more elegant, but eats cycles, - * unfortunately. - */ - if (!skb_queue_empty(&tp->ucopy.prequeue)) - goto do_prequeue; - - /* __ Set realtime policy in scheduler __ */ - } - if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); @@ -1859,31 +1811,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, sk_wait_data(sk, &timeo, last); } - if (user_recv) { - int chunk; - - /* __ Restore normal policy in scheduler __ */ - - chunk = len - tp->ucopy.len; - if (chunk != 0) { - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); - len -= chunk; - copied += chunk; - } - - if (tp->rcv_nxt == tp->copied_seq && - !skb_queue_empty(&tp->ucopy.prequeue)) { -do_prequeue: - tcp_prequeue_process(sk); - - chunk = len - tp->ucopy.len; - if (chunk != 0) { - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); - len -= chunk; - copied += chunk; - } - } - } if ((flags & MSG_PEEK) && (peek_seq - copied - urg_hole != tp->copied_seq)) { net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", @@ -1934,10 +1861,8 @@ do_prequeue: tcp_rcv_space_adjust(sk); skip_copy: - if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { + if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) tp->urg_data = 0; - tcp_fast_path_check(sk); - } if (used + offset < skb->len) continue; @@ -1955,25 +1880,6 @@ skip_copy: break; } while (len > 0); - if (user_recv) { - if (!skb_queue_empty(&tp->ucopy.prequeue)) { - int chunk; - - tp->ucopy.len = copied > 0 ? len : 0; - - tcp_prequeue_process(sk); - - if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); - len -= chunk; - copied += chunk; - } - } - - tp->ucopy.task = NULL; - tp->ucopy.len = 0; - } - /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ @@ -2823,7 +2729,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); - u32 now, intv; + u32 now; u64 rate64; bool slow; u32 rate; @@ -2922,13 +2828,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_data_segs_out = tp->data_segs_out; info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0; - rate = READ_ONCE(tp->rate_delivered); - intv = READ_ONCE(tp->rate_interval_us); - if (rate && intv) { - rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; - do_div(rate64, intv); + rate64 = tcp_compute_delivery_rate(tp); + if (rate64) info->tcpi_delivery_rate = rate64; - } unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2938,8 +2840,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; struct tcp_info info; + u64 rate64; + u32 rate; - stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC); + stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + + 3 * nla_total_size(sizeof(u32)) + + 2 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -2954,6 +2860,20 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) tp->data_segs_out, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, tp->total_retrans, TCP_NLA_PAD); + + rate = READ_ONCE(sk->sk_pacing_rate); + rate64 = rate != ~0U ? rate : ~0ULL; + nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); + + rate64 = tcp_compute_delivery_rate(tp); + nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); + + nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd); + nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); + nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); + + nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); + nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); return stats; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2920e0cb09f8..af0a98d54b62 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -103,7 +103,6 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ -#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ @@ -3367,12 +3366,6 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 if (tp->snd_wnd != nwin) { tp->snd_wnd = nwin; - /* Note, it is the only place, where - * fast path is recovered for sending TCP. - */ - tp->pred_flags = 0; - tcp_fast_path_check(sk); - if (tcp_send_head(sk)) tcp_slow_start_after_idle_check(sk); @@ -3554,6 +3547,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 lost = tp->lost; int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 ack_ev_flags = 0; sack_state.first_sackt = 0; sack_state.rate = &rs; @@ -3597,42 +3591,26 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (flag & FLAG_UPDATE_TS_RECENT) tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { - /* Window is constant, pure forward advance. - * No more checks are required. - * Note, we use the fact that SND.UNA>=SND.WL2. - */ - tcp_update_wl(tp, ack_seq); - tcp_snd_una_update(tp, ack); - flag |= FLAG_WIN_UPDATE; - - tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); - - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); - } else { - u32 ack_ev_flags = CA_ACK_SLOWPATH; - - if (ack_seq != TCP_SKB_CB(skb)->end_seq) - flag |= FLAG_DATA; - else - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); + if (ack_seq != TCP_SKB_CB(skb)->end_seq) + flag |= FLAG_DATA; + else + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); - flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); + flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); - if (TCP_SKB_CB(skb)->sacked) - flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_state); + if (TCP_SKB_CB(skb)->sacked) + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, + &sack_state); - if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { - flag |= FLAG_ECE; - ack_ev_flags |= CA_ACK_ECE; - } + if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { + flag |= FLAG_ECE; + ack_ev_flags = CA_ACK_ECE; + } - if (flag & FLAG_WIN_UPDATE) - ack_ev_flags |= CA_ACK_WIN_UPDATE; + if (flag & FLAG_WIN_UPDATE) + ack_ev_flags |= CA_ACK_WIN_UPDATE; - tcp_in_ack_event(sk, ack_ev_flags); - } + tcp_in_ack_event(sk, ack_ev_flags); /* We passed data and got it acked, remove any soft error * log. Something worked... @@ -4398,8 +4376,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) return; } - /* Disable header prediction. */ - tp->pred_flags = 0; inet_csk_schedule_ack(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); @@ -4611,32 +4587,14 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) goto out_of_window; /* Ok. In sequence. In window. */ - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && tp->ucopy.len && - sock_owned_by_user(sk) && !tp->urg_data) { - int chunk = min_t(unsigned int, skb->len, - tp->ucopy.len); - - __set_current_state(TASK_RUNNING); - - if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) { - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - eaten = (chunk == skb->len); - tcp_rcv_space_adjust(sk); - } - } - - if (eaten <= 0) { queue_and_out: - if (eaten < 0) { - if (skb_queue_len(&sk->sk_receive_queue) == 0) - sk_forced_mem_schedule(sk, skb->truesize); - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) - goto drop; - } - eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); + if (eaten < 0) { + if (skb_queue_len(&sk->sk_receive_queue) == 0) + sk_forced_mem_schedule(sk, skb->truesize); + else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + goto drop; } + eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (skb->len) tcp_event_data_recv(sk, skb); @@ -4656,8 +4614,6 @@ queue_and_out: if (tp->rx_opt.num_sacks) tcp_sack_remove(tp); - tcp_fast_path_check(sk); - if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) @@ -4983,7 +4939,6 @@ static int tcp_prune_queue(struct sock *sk) NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED); /* Massive buffer overcommit. */ - tp->pred_flags = 0; return -1; } @@ -5155,9 +5110,6 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) tp->urg_data = TCP_URG_NOTYET; tp->urg_seq = ptr; - - /* Disable header prediction. */ - tp->pred_flags = 0; } /* This is the 'fast' part of urgent handling. */ @@ -5186,26 +5138,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t } } -static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) -{ - struct tcp_sock *tp = tcp_sk(sk); - int chunk = skb->len - hlen; - int err; - - if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk); - else - err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg); - - if (!err) { - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - tcp_rcv_space_adjust(sk); - } - - return err; -} - /* Accept RST for rcv_nxt - 1 after a FIN. * When tcp connections are abruptly terminated from Mac OSX (via ^C), a * FIN is sent followed by a RST packet. The RST is sent with the same @@ -5336,201 +5268,29 @@ discard: /* * TCP receive function for the ESTABLISHED state. - * - * It is split into a fast path and a slow path. The fast path is - * disabled when: - * - A zero window was announced from us - zero window probing - * is only handled properly in the slow path. - * - Out of order segments arrived. - * - Urgent data is expected. - * - There is no buffer space left - * - Unexpected TCP flags/window values/header lengths are received - * (detected by checking the TCP header against pred_flags) - * - Data is sent in both directions. Fast path only supports pure senders - * or pure receivers (this means either the sequence number or the ack - * value must stay constant) - * - Unexpected TCP option. - * - * When these conditions are not satisfied it drops into a standard - * receive procedure patterned after RFC793 to handle all cases. - * The first three cases are guaranteed by proper pred_flags setting, - * the rest is checked inline. Fast processing is turned on in - * tcp_data_queue when everything is OK. */ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) + const struct tcphdr *th) { + unsigned int len = skb->len; struct tcp_sock *tp = tcp_sk(sk); tcp_mstamp_refresh(tp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); - /* - * Header prediction. - * The code loosely follows the one in the famous - * "30 instruction TCP receive" Van Jacobson mail. - * - * Van's trick is to deposit buffers into socket queue - * on a device interrupt, to call tcp_recv function - * on the receive process context and checksum and copy - * the buffer to user space. smart... - * - * Our current scheme is not silly either but we take the - * extra cost of the net_bh soft interrupt processing... - * We do checksum and copy also but from device to kernel. - */ tp->rx_opt.saw_tstamp = 0; - /* pred_flags is 0xS?10 << 16 + snd_wnd - * if header_prediction is to be made - * 'S' will always be tp->tcp_header_len >> 2 - * '?' will be 0 for the fast path, otherwise pred_flags is 0 to - * turn it off (when there are holes in the receive - * space for instance) - * PSH flag is ignored. - */ - - if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && - TCP_SKB_CB(skb)->seq == tp->rcv_nxt && - !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { - int tcp_header_len = tp->tcp_header_len; - - /* Timestamp header prediction: tcp_header_len - * is automatically equal to th->doff*4 due to pred_flags - * match. - */ - - /* Check timestamp */ - if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { - /* No? Slow path! */ - if (!tcp_parse_aligned_timestamp(tp, th)) - goto slow_path; - - /* If PAWS failed, check it more carefully in slow path */ - if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) - goto slow_path; - - /* DO NOT update ts_recent here, if checksum fails - * and timestamp was corrupted part, it will result - * in a hung connection since we will drop all - * future packets due to the PAWS test. - */ - } - - if (len <= tcp_header_len) { - /* Bulk data transfer: sender */ - if (len == tcp_header_len) { - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - /* We know that such packets are checksummed - * on entry. - */ - tcp_ack(sk, skb, 0); - __kfree_skb(skb); - tcp_data_snd_check(sk); - return; - } else { /* Header too small */ - TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); - goto discard; - } - } else { - int eaten = 0; - bool fragstolen = false; - - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && - len - tcp_header_len <= tp->ucopy.len && - sock_owned_by_user(sk)) { - __set_current_state(TASK_RUNNING); - - if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + - TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - tcp_rcv_rtt_measure_ts(sk, skb); - - __skb_pull(skb, tcp_header_len); - tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPHPHITSTOUSER); - eaten = 1; - } - } - if (!eaten) { - if (tcp_checksum_complete(skb)) - goto csum_error; - - if ((int)skb->truesize > sk->sk_forward_alloc) - goto step5; - - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - tcp_rcv_rtt_measure_ts(sk, skb); - - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); - - /* Bulk data transfer: receiver */ - eaten = tcp_queue_rcv(sk, skb, tcp_header_len, - &fragstolen); - } - - tcp_event_data_recv(sk, skb); - - if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { - /* Well, only one small jumplet in fast path... */ - tcp_ack(sk, skb, FLAG_DATA); - tcp_data_snd_check(sk); - if (!inet_csk_ack_scheduled(sk)) - goto no_ack; - } - - __tcp_ack_snd_check(sk, 0); -no_ack: - if (eaten) - kfree_skb_partial(skb, fragstolen); - sk->sk_data_ready(sk); - return; - } - } - -slow_path: if (len < (th->doff << 2) || tcp_checksum_complete(skb)) goto csum_error; if (!th->ack && !th->rst && !th->syn) goto discard; - /* - * Standard slow path. - */ - if (!tcp_validate_incoming(sk, skb, th, 1)) return; -step5: - if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) + if (tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT) < 0) goto discard; tcp_rcv_rtt_measure_ts(sk, skb); @@ -5584,11 +5344,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) if (sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); - if (!tp->rx_opt.snd_wscale) - __tcp_fast_path_on(tp, tp->snd_wnd); - else - tp->pred_flags = 0; - + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); + } } static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, @@ -5717,7 +5476,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_ecn_rcv_synack(tp, th); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - tcp_ack(sk, skb, FLAG_SLOWPATH); + tcp_ack(sk, skb, 0); /* Ok.. it's good. Set up sequence numbers and * move to established. @@ -5953,8 +5712,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) return 0; /* step 5: check the ACK field */ - acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | - FLAG_UPDATE_TS_RECENT | + + acceptable = tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT | FLAG_NO_CHALLENGE_ACK) > 0; if (!acceptable) { @@ -6022,7 +5781,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); - tcp_fast_path_on(tp); break; case TCP_FIN_WAIT1: { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a20e7f03d5f7..9b51663cd5a4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -85,8 +85,6 @@ #include <crypto/hash.h> #include <linux/scatterlist.h> -int sysctl_tcp_low_latency __read_mostly; - #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); @@ -1458,7 +1456,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk->sk_rx_dst = NULL; } } - tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); + tcp_rcv_established(sk, skb, tcp_hdr(skb)); return 0; } @@ -1541,61 +1539,6 @@ void tcp_v4_early_demux(struct sk_buff *skb) } } -/* Packet is added to VJ-style prequeue for processing in process - * context, if a reader task is waiting. Apparently, this exciting - * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93) - * failed somewhere. Latency? Burstiness? Well, at least now we will - * see, why it failed. 8)8) --ANK - * - */ -bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (sysctl_tcp_low_latency || !tp->ucopy.task) - return false; - - if (skb->len <= tcp_hdrlen(skb) && - skb_queue_len(&tp->ucopy.prequeue) == 0) - return false; - - /* Before escaping RCU protected region, we need to take care of skb - * dst. Prequeue is only enabled for established sockets. - * For such sockets, we might need the skb dst only to set sk->sk_rx_dst - * Instead of doing full sk_rx_dst validity here, let's perform - * an optimistic check. - */ - if (likely(sk->sk_rx_dst)) - skb_dst_drop(skb); - else - skb_dst_force_safe(skb); - - __skb_queue_tail(&tp->ucopy.prequeue, skb); - tp->ucopy.memory += skb->truesize; - if (skb_queue_len(&tp->ucopy.prequeue) >= 32 || - tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { - struct sk_buff *skb1; - - BUG_ON(sock_owned_by_user(sk)); - __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED, - skb_queue_len(&tp->ucopy.prequeue)); - - while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb1); - - tp->ucopy.memory = 0; - } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { - wake_up_interruptible_sync_poll(sk_sleep(sk), - POLLIN | POLLRDNORM | POLLRDBAND); - if (!inet_csk_ack_scheduled(sk)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - (3 * tcp_rto_min(sk)) / 4, - TCP_RTO_MAX); - } - return true; -} -EXPORT_SYMBOL(tcp_prequeue); - bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; @@ -1770,8 +1713,7 @@ process: tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v4_do_rcv(sk, skb); + ret = tcp_v4_do_rcv(sk, skb); } else if (tcp_add_backlog(sk, skb)) { goto discard_and_relse; } @@ -1936,9 +1878,6 @@ void tcp_v4_destroy_sock(struct sock *sk) } #endif - /* Clean prequeue, it must be empty really */ - __skb_queue_purge(&tp->ucopy.prequeue); - /* Clean up a referenced TCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 0ff83c1637d8..1537b87c657f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -436,8 +436,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct tcp_sock *newtp = tcp_sk(newsk); /* Now setup tcp_sock */ - newtp->pred_flags = 0; - newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; newtp->segs_in = 1; @@ -445,7 +443,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; - tcp_prequeue_init(newtp); INIT_LIST_HEAD(&newtp->tsq_node); tcp_init_wl(newtp, treq->rcv_isn); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2f1588bf73da..d49bff51bdb7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -295,9 +295,7 @@ static u16 tcp_select_window(struct sock *sk) /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; - /* If we advertise zero window, disable fast path. */ if (new_win == 0) { - tp->pred_flags = 0; if (old_win) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTOZEROWINDOWADV); @@ -2378,7 +2376,6 @@ bool tcp_schedule_loss_probe(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 timeout, tlp_time_stamp, rto_time_stamp; - u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); /* No consecutive loss probes. */ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { @@ -2407,15 +2404,19 @@ bool tcp_schedule_loss_probe(struct sock *sk) tcp_send_head(sk)) return false; - /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account + /* Probe timeout is 2*rtt. Add minimum RTO to account * for delayed ack when there's one outstanding packet. If no RTT * sample is available then probe after TCP_TIMEOUT_INIT. */ - timeout = rtt << 1 ? : TCP_TIMEOUT_INIT; - if (tp->packets_out == 1) - timeout = max_t(u32, timeout, - (rtt + (rtt >> 1) + TCP_DELACK_MAX)); - timeout = max_t(u32, timeout, msecs_to_jiffies(10)); + if (tp->srtt_us) { + timeout = usecs_to_jiffies(tp->srtt_us >> 2); + if (tp->packets_out == 1) + timeout += TCP_RTO_MIN; + else + timeout += TCP_TIMEOUT_MIN; + } else { + timeout = TCP_TIMEOUT_INIT; + } /* If RTO is shorter, just schedule TLP in its place. */ tlp_time_stamp = tcp_jiffies32 + timeout; diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index f6c50af24a64..697f4c67b2e3 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -105,8 +105,9 @@ static inline int tcp_probe_avail(void) * Note: arguments must match tcp_rcv_established()! */ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) + const struct tcphdr *th) { + unsigned int len = skb->len; const struct tcp_sock *tp = tcp_sk(sk); const struct inet_sock *inet = inet_sk(sk); @@ -145,7 +146,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, BUG(); } - p->length = skb->len; + p->length = len; p->snd_nxt = tp->snd_nxt; p->snd_una = tp->snd_una; p->snd_cwnd = tp->snd_cwnd; diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index fe9a493d0208..449cd914d58e 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -113,7 +113,7 @@ void tcp_rack_mark_lost(struct sock *sk) tp->rack.advanced = 0; tcp_rack_detect_loss(sk, &timeout); if (timeout) { - timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN); + timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN; inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, timeout, inet_csk(sk)->icsk_rto); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c0feeeef962a..f753f9d2fee3 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -239,7 +239,6 @@ static int tcp_write_timeout(struct sock *sk) /* Called with BH disabled */ void tcp_delack_timer_handler(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); sk_mem_reclaim_partial(sk); @@ -254,17 +253,6 @@ void tcp_delack_timer_handler(struct sock *sk) } icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; - if (!skb_queue_empty(&tp->ucopy.prequeue)) { - struct sk_buff *skb; - - __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); - - while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb); - - tp->ucopy.memory = 0; - } - if (inet_csk_ack_scheduled(sk)) { if (!icsk->icsk_ack.pingpong) { /* Delayed ACK missed: inflate ATO. */ diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index bec9cafbe3f9..e5de84310949 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -154,24 +154,6 @@ static inline void update_rtt_min(struct westwood *w) } /* - * @westwood_fast_bw - * It is called when we are in fast path. In particular it is called when - * header prediction is successful. In such case in fact update is - * straight forward and doesn't need any particular care. - */ -static inline void westwood_fast_bw(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct westwood *w = inet_csk_ca(sk); - - westwood_update_window(sk); - - w->bk += tp->snd_una - w->snd_una; - w->snd_una = tp->snd_una; - update_rtt_min(w); -} - -/* * @westwood_acked_count * This function evaluates cumul_ack for evaluating bk in case of * delayed or partial acks. @@ -223,17 +205,12 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk) static void tcp_westwood_ack(struct sock *sk, u32 ack_flags) { - if (ack_flags & CA_ACK_SLOWPATH) { - struct westwood *w = inet_csk_ca(sk); - - westwood_update_window(sk); - w->bk += westwood_acked_count(sk); + struct westwood *w = inet_csk_ca(sk); - update_rtt_min(w); - return; - } + westwood_update_window(sk); + w->bk += westwood_acked_count(sk); - westwood_fast_bw(sk); + update_rtt_min(w); } static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 781250151d40..97658bfc1b58 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, __be16 new_protocol, bool is_ipv6) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); - bool remcsum, need_csum, offload_csum, ufo, gso_partial; + bool remcsum, need_csum, offload_csum, gso_partial; struct sk_buff *segs = ERR_PTR(-EINVAL); struct udphdr *uh = udp_hdr(skb); u16 mac_offset = skb->mac_header; @@ -61,8 +61,6 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); skb->remcsum_offload = remcsum; - ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); - need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); /* Try to offload checksum if possible */ offload_csum = !!(need_csum && @@ -77,7 +75,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, * outer one so strip the existing checksum feature flags and * instead set the flag based on our outer checksum offload value. */ - if (remcsum || ufo) { + if (remcsum) { features &= ~NETIF_F_CSUM_MASK; if (!need_csum || offload_csum) features |= NETIF_F_HW_CSUM; @@ -189,66 +187,16 @@ out_unlock: } EXPORT_SYMBOL(skb_udp_tunnel_segment); -static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *udp4_tunnel_segment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); - unsigned int mss; - __wsum csum; - struct udphdr *uh; - struct iphdr *iph; if (skb->encapsulation && (skb_shinfo(skb)->gso_type & - (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) segs = skb_udp_tunnel_segment(skb, features, false); - goto out; - } - if (!pskb_may_pull(skb, sizeof(struct udphdr))) - goto out; - - mss = skb_shinfo(skb)->gso_size; - if (unlikely(skb->len <= mss)) - goto out; - - if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { - /* Packet is from an untrusted source, reset gso_segs. */ - - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); - - segs = NULL; - goto out; - } - - /* Do software UFO. Complete and fill in the UDP checksum as - * HW cannot do checksum of UDP packets sent as multiple - * IP fragments. - */ - - uh = udp_hdr(skb); - iph = ip_hdr(skb); - - uh->check = 0; - csum = skb_checksum(skb, 0, skb->len, 0); - uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - - skb->ip_summed = CHECKSUM_NONE; - - /* If there is no outer header we can fake a checksum offload - * due to the fact that we have already done the checksum in - * software prior to segmenting the frame. - */ - if (!skb->encap_hdr_csum) - features |= NETIF_F_HW_CSUM; - - /* Fragment the skb. IP headers of the fragments are updated in - * inet_gso_segment() - */ - segs = skb_segment(skb, features); -out: return segs; } @@ -382,7 +330,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv4_offload = { .callbacks = { - .gso_segment = udp4_ufo_fragment, + .gso_segment = udp4_tunnel_segment, .gro_receive = udp4_gro_receive, .gro_complete = udp4_gro_complete, }, diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 58bd39fb14b4..6539ff15e9a3 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -82,7 +82,8 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, struct sock *sk = sock->sk; struct udp_tunnel_info ti; - if (!dev->netdev_ops->ndo_udp_tunnel_add) + if (!dev->netdev_ops->ndo_udp_tunnel_add || + !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) return; ti.type = type; @@ -93,6 +94,24 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, } EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); +void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, + unsigned short type) +{ + struct sock *sk = sock->sk; + struct udp_tunnel_info ti; + + if (!dev->netdev_ops->ndo_udp_tunnel_del || + !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); +} +EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port); + /* Notify netdevs that UDP port started listening */ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) { @@ -109,6 +128,8 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) for_each_netdev_rcu(net, dev) { if (!dev->netdev_ops->ndo_udp_tunnel_add) continue; + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + continue; dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); } rcu_read_unlock(); @@ -131,6 +152,8 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) for_each_netdev_rcu(net, dev) { if (!dev->netdev_ops->ndo_udp_tunnel_del) continue; + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + continue; dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); } rcu_read_unlock(); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 71b4ecc195c7..4aefb149fe0a 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -213,14 +213,6 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) fl4->flowi4_tos = iph->tos; } -static inline int xfrm4_garbage_collect(struct dst_ops *ops) -{ - struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); - - xfrm_garbage_collect_deferred(net); - return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); -} - static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu) { @@ -259,14 +251,13 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, static struct dst_ops xfrm4_dst_ops_template = { .family = AF_INET, - .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, .redirect = xfrm4_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, - .gc_thresh = INT_MAX, + .gc_thresh = 32768, }; static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index a88b5b5b7955..0a7c74049a0c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -210,7 +210,7 @@ lookup_protocol: np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->autoflowlabel = ip6_default_np_autolabel(sock_net(sk)); + np->autoflowlabel = ip6_default_np_autolabel(net); sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 162efba0d0cd..43ca864327c7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1110,69 +1110,6 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, } EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); -static inline int ip6_ufo_append_data(struct sock *sk, - struct sk_buff_head *queue, - int getfrag(void *from, char *to, int offset, int len, - int odd, struct sk_buff *skb), - void *from, int length, int hh_len, int fragheaderlen, - int exthdrlen, int transhdrlen, int mtu, - unsigned int flags, const struct flowi6 *fl6) - -{ - struct sk_buff *skb; - int err; - - /* There is support for UDP large send offload by network - * device, so create one single skb packet containing complete - * udp datagram - */ - skb = skb_peek_tail(queue); - if (!skb) { - skb = sock_alloc_send_skb(sk, - hh_len + fragheaderlen + transhdrlen + 20, - (flags & MSG_DONTWAIT), &err); - if (!skb) - return err; - - /* reserve space for Hardware header */ - skb_reserve(skb, hh_len); - - /* create space for UDP/IP header */ - skb_put(skb, fragheaderlen + transhdrlen); - - /* initialize network header pointer */ - skb_set_network_header(skb, exthdrlen); - - /* initialize protocol header pointer */ - skb->transport_header = skb->network_header + fragheaderlen; - - skb->protocol = htons(ETH_P_IPV6); - skb->csum = 0; - - if (flags & MSG_CONFIRM) - skb_set_dst_pending_confirm(skb, 1); - - __skb_queue_tail(queue, skb); - } else if (skb_is_gso(skb)) { - goto append; - } - - skb->ip_summed = CHECKSUM_PARTIAL; - /* Specify the length of each IPv6 datagram fragment. - * It has to be a multiple of 8. - */ - skb_shinfo(skb)->gso_size = (mtu - fragheaderlen - - sizeof(struct frag_hdr)) & ~7; - skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk), - &fl6->daddr, - &fl6->saddr); - -append: - return skb_append_datato_frags(sk, skb, getfrag, from, - (length - transhdrlen)); -} - static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, gfp_t gfp) { @@ -1381,19 +1318,6 @@ emsgsize: */ cork->length += length; - if ((((length + (skb ? skb->len : headersize)) > mtu) || - (skb && skb_is_gso(skb))) && - (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) && - (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) { - err = ip6_ufo_append_data(sk, queue, getfrag, from, length, - hh_len, fragheaderlen, exthdrlen, - transhdrlen, mtu, flags, fl6); - if (err) - goto error; - return 0; - } - if (!skb) goto alloc_new_skb; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 486c2305f53c..79444a4bfd6d 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -1145,33 +1145,6 @@ static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { .priority = 100, }; -static bool is_vti6_tunnel(const struct net_device *dev) -{ - return dev->netdev_ops == &vti6_netdev_ops; -} - -static int vti6_device_event(struct notifier_block *unused, - unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct ip6_tnl *t = netdev_priv(dev); - - if (!is_vti6_tunnel(dev)) - return NOTIFY_DONE; - - switch (event) { - case NETDEV_DOWN: - if (!net_eq(t->net, dev_net(dev))) - xfrm_garbage_collect(t->net); - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block vti6_notifier_block __read_mostly = { - .notifier_call = vti6_device_event, -}; - /** * vti6_tunnel_init - register protocol and reserve needed resources * @@ -1182,8 +1155,6 @@ static int __init vti6_tunnel_init(void) const char *msg; int err; - register_netdevice_notifier(&vti6_notifier_block); - msg = "tunnel device"; err = register_pernet_device(&vti6_net_ops); if (err < 0) @@ -1216,7 +1187,6 @@ xfrm_proto_ah_failed: xfrm_proto_esp_failed: unregister_pernet_device(&vti6_net_ops); pernet_dev_failed: - unregister_netdevice_notifier(&vti6_notifier_block); pr_err("vti6 init: failed to register %s\n", msg); return err; } @@ -1231,7 +1201,6 @@ static void __exit vti6_tunnel_cleanup(void) xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); unregister_pernet_device(&vti6_net_ops); - unregister_netdevice_notifier(&vti6_notifier_block); } module_init(vti6_tunnel_init); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2521690d62d6..ced5dcf37465 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1296,7 +1296,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } } - tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); + tcp_rcv_established(sk, skb, tcp_hdr(skb)); if (opt_skb) goto ipv6_pktoptions; return 0; @@ -1505,8 +1505,7 @@ process: tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v6_do_rcv(sk, skb); + ret = tcp_v6_do_rcv(sk, skb); } else if (tcp_add_backlog(sk, skb)) { goto discard_and_relse; } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index a2267f80febb..455fd4e39333 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -17,109 +17,15 @@ #include <net/ip6_checksum.h> #include "ip6_offload.h" -static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *udp6_tunnel_segment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); - unsigned int mss; - unsigned int unfrag_ip6hlen, unfrag_len; - struct frag_hdr *fptr; - u8 *packet_start, *prevhdr; - u8 nexthdr; - u8 frag_hdr_sz = sizeof(struct frag_hdr); - __wsum csum; - int tnl_hlen; - int err; - - mss = skb_shinfo(skb)->gso_size; - if (unlikely(skb->len <= mss)) - goto out; - - if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { - /* Packet is from an untrusted source, reset gso_segs. */ - - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); - - /* Set the IPv6 fragment id if not set yet */ - if (!skb_shinfo(skb)->ip6_frag_id) - ipv6_proxy_select_ident(dev_net(skb->dev), skb); - - segs = NULL; - goto out; - } if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) segs = skb_udp_tunnel_segment(skb, features, true); - else { - const struct ipv6hdr *ipv6h; - struct udphdr *uh; - - if (!pskb_may_pull(skb, sizeof(struct udphdr))) - goto out; - - /* Do software UFO. Complete and fill in the UDP checksum as HW cannot - * do checksum of UDP packets sent as multiple IP fragments. - */ - - uh = udp_hdr(skb); - ipv6h = ipv6_hdr(skb); - - uh->check = 0; - csum = skb_checksum(skb, 0, skb->len, 0); - uh->check = udp_v6_check(skb->len, &ipv6h->saddr, - &ipv6h->daddr, csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - - skb->ip_summed = CHECKSUM_NONE; - - /* If there is no outer header we can fake a checksum offload - * due to the fact that we have already done the checksum in - * software prior to segmenting the frame. - */ - if (!skb->encap_hdr_csum) - features |= NETIF_F_HW_CSUM; - - /* Check if there is enough headroom to insert fragment header. */ - tnl_hlen = skb_tnl_header_len(skb); - if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) { - if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz)) - goto out; - } - - /* Find the unfragmentable header and shift it left by frag_hdr_sz - * bytes to insert fragment header. - */ - err = ip6_find_1stfragopt(skb, &prevhdr); - if (err < 0) - return ERR_PTR(err); - unfrag_ip6hlen = err; - nexthdr = *prevhdr; - *prevhdr = NEXTHDR_FRAGMENT; - unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + - unfrag_ip6hlen + tnl_hlen; - packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset; - memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len); - - SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz; - skb->mac_header -= frag_hdr_sz; - skb->network_header -= frag_hdr_sz; - - fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); - fptr->nexthdr = nexthdr; - fptr->reserved = 0; - if (!skb_shinfo(skb)->ip6_frag_id) - ipv6_proxy_select_ident(dev_net(skb->dev), skb); - fptr->identification = skb_shinfo(skb)->ip6_frag_id; - - /* Fragment the skb. ipv6 header and the remaining fields of the - * fragment header are updated in ipv6_gso_segment() - */ - segs = skb_segment(skb, features); - } -out: return segs; } @@ -169,7 +75,7 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv6_offload = { .callbacks = { - .gso_segment = udp6_ufo_fragment, + .gso_segment = udp6_tunnel_segment, .gro_receive = udp6_gro_receive, .gro_complete = udp6_gro_complete, }, diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 79651bc71bf0..f44b25a48478 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -214,14 +214,6 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) } } -static inline int xfrm6_garbage_collect(struct dst_ops *ops) -{ - struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops); - - xfrm_garbage_collect_deferred(net); - return dst_entries_get_fast(ops) > ops->gc_thresh * 2; -} - static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu) { @@ -279,14 +271,13 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, static struct dst_ops xfrm6_dst_ops_template = { .family = AF_INET6, - .gc = xfrm6_garbage_collect, .update_pmtu = xfrm6_update_pmtu, .redirect = xfrm6_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, .local_out = __ip6_local_out, - .gc_thresh = INT_MAX, + .gc_thresh = 32768, }; static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = { diff --git a/net/key/af_key.c b/net/key/af_key.c index ca9d3ae665e7..10d7133e4fe9 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2398,8 +2398,6 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa out: xfrm_pol_put(xp); - if (err == 0) - xfrm_garbage_collect(net); return err; } @@ -2650,8 +2648,6 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_ out: xfrm_pol_put(xp); - if (delete && err == 0) - xfrm_garbage_collect(net); return err; } @@ -2751,8 +2747,6 @@ static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sad int err, err2; err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, true); - if (!err) - xfrm_garbage_collect(net); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - old silent behavior */ diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 3f6c4fa78bdb..245fa350a7a8 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -106,7 +106,7 @@ static DEFINE_SPINLOCK(recent_lock); static DEFINE_MUTEX(recent_mutex); #ifdef CONFIG_PROC_FS -static const struct file_operations recent_old_fops, recent_mt_fops; +static const struct file_operations recent_mt_fops; #endif static u_int32_t hash_rnd __read_mostly; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 45fe8c8a884d..f6e229b51dfb 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -335,8 +335,6 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { - unsigned short gso_type = skb_shinfo(skb)->gso_type; - struct sw_flow_key later_key; struct sk_buff *segs, *nskb; int err; @@ -347,21 +345,9 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, if (segs == NULL) return -EINVAL; - if (gso_type & SKB_GSO_UDP) { - /* The initial flow key extracted by ovs_flow_key_extract() - * in this case is for a first fragment, so we need to - * properly mark later fragments. - */ - later_key = *key; - later_key.ip.frag = OVS_FRAG_TYPE_LATER; - } - /* Queue all of the segments. */ skb = segs; do { - if (gso_type & SKB_GSO_UDP && skb != segs) - key = &later_key; - err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); if (err) break; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 3f76cb765e5b..8c94cef25a72 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -72,8 +72,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, const struct sk_buff *skb) { struct flow_stats *stats; - int node = numa_node_id(); - int cpu = smp_processor_id(); + unsigned int cpu = smp_processor_id(); int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); stats = rcu_dereference(flow->stats[cpu]); @@ -108,7 +107,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, __GFP_THISNODE | __GFP_NOWARN | __GFP_NOMEMALLOC, - node); + numa_node_id()); if (likely(new_stats)) { new_stats->used = jiffies; new_stats->packet_count = 1; @@ -118,6 +117,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, rcu_assign_pointer(flow->stats[cpu], new_stats); + cpumask_set_cpu(cpu, &flow->cpu_used_mask); goto unlock; } } @@ -145,7 +145,7 @@ void ovs_flow_stats_get(const struct sw_flow *flow, memset(ovs_stats, 0, sizeof(*ovs_stats)); /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) { + for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) { struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]); if (stats) { @@ -169,7 +169,7 @@ void ovs_flow_stats_clear(struct sw_flow *flow) int cpu; /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) { + for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) { struct flow_stats *stats = ovsl_dereference(flow->stats[cpu]); if (stats) { @@ -584,8 +584,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) key->ip.frag = OVS_FRAG_TYPE_LATER; return 0; } - if (nh->frag_off & htons(IP_MF) || - skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + if (nh->frag_off & htons(IP_MF)) key->ip.frag = OVS_FRAG_TYPE_FIRST; else key->ip.frag = OVS_FRAG_TYPE_NONE; @@ -701,9 +700,6 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) if (key->ip.frag == OVS_FRAG_TYPE_LATER) return 0; - if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) - key->ip.frag = OVS_FRAG_TYPE_FIRST; - /* Transport layer. */ if (key->ip.proto == NEXTHDR_TCP) { if (tcphdr_ok(skb)) { diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index a9bc1c875965..1875bba4f865 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -31,6 +31,7 @@ #include <linux/jiffies.h> #include <linux/time.h> #include <linux/flex_array.h> +#include <linux/cpumask.h> #include <net/inet_ecn.h> #include <net/ip_tunnels.h> #include <net/dst_metadata.h> @@ -219,6 +220,7 @@ struct sw_flow { */ struct sw_flow_key key; struct sw_flow_id id; + struct cpumask cpu_used_mask; struct sw_flow_mask *mask; struct sw_flow_actions __rcu *sf_acts; struct flow_stats __rcu *stats[]; /* One for each CPU. First one diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index ea7a8073fa02..80ea2a71852e 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -98,6 +98,8 @@ struct sw_flow *ovs_flow_alloc(void) RCU_INIT_POINTER(flow->stats[0], stats); + cpumask_set_cpu(0, &flow->cpu_used_mask); + return flow; err: kmem_cache_free(flow_cache, flow); @@ -141,7 +143,7 @@ static void flow_free(struct sw_flow *flow) if (flow->sf_acts) ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts); /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) + for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) if (flow->stats[cpu]) kmem_cache_free(flow_stats_cache, (struct flow_stats __force *)flow->stats[cpu]); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 0615c2a950fa..5a178047a7ce 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -177,8 +177,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, #define BLK_PLUS_PRIV(sz_of_priv) \ (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT)) -#define PGV_FROM_VMALLOC 1 - #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status) #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts) #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt) diff --git a/net/rds/connection.c b/net/rds/connection.c index 50a3789ac23e..005bca68aa94 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -374,13 +374,13 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp) if (!cp->cp_transport_data) return; - rds_conn_path_drop(cp); - flush_work(&cp->cp_down_w); - /* make sure lingering queued work won't try to ref the conn */ cancel_delayed_work_sync(&cp->cp_send_w); cancel_delayed_work_sync(&cp->cp_recv_w); + rds_conn_path_drop(cp, true); + flush_work(&cp->cp_down_w); + /* tear down queued messages */ list_for_each_entry_safe(rm, rtmp, &cp->cp_send_queue, @@ -664,9 +664,13 @@ void rds_conn_exit(void) /* * Force a disconnect */ -void rds_conn_path_drop(struct rds_conn_path *cp) +void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy) { atomic_set(&cp->cp_state, RDS_CONN_ERROR); + + if (!destroy && cp->cp_conn->c_destroy_in_prog) + return; + queue_work(rds_wq, &cp->cp_down_w); } EXPORT_SYMBOL_GPL(rds_conn_path_drop); @@ -674,7 +678,7 @@ EXPORT_SYMBOL_GPL(rds_conn_path_drop); void rds_conn_drop(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); - rds_conn_path_drop(&conn->c_path[0]); + rds_conn_path_drop(&conn->c_path[0], false); } EXPORT_SYMBOL_GPL(rds_conn_drop); @@ -706,5 +710,5 @@ __rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...) vprintk(fmt, ap); va_end(ap); - rds_conn_path_drop(cp); + rds_conn_path_drop(cp, false); } diff --git a/net/rds/rds.h b/net/rds/rds.h index 516bcc89b46f..3382695bf46c 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -700,7 +700,7 @@ struct rds_connection *rds_conn_create_outgoing(struct net *net, void rds_conn_shutdown(struct rds_conn_path *cpath); void rds_conn_destroy(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); -void rds_conn_path_drop(struct rds_conn_path *cpath); +void rds_conn_path_drop(struct rds_conn_path *cpath, bool destroy); void rds_conn_connect_if_down(struct rds_connection *conn); void rds_conn_path_connect_if_down(struct rds_conn_path *cp); void rds_for_each_conn_info(struct socket *sock, unsigned int len, diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 431404dbdad1..6b7ee71f40c6 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -592,7 +592,7 @@ static void rds_tcp_sysctl_reset(struct net *net) continue; /* reconnect with new parameters */ - rds_conn_path_drop(tc->t_cpath); + rds_conn_path_drop(tc->t_cpath, false); } spin_unlock_irq(&rds_tcp_conn_lock); } diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index cbe08a1fa4c7..46f74dad0e16 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -69,14 +69,14 @@ void rds_tcp_state_change(struct sock *sk) if (!IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr) && rds_conn_path_transition(cp, RDS_CONN_CONNECTING, RDS_CONN_ERROR)) { - rds_conn_path_drop(cp); + rds_conn_path_drop(cp, false); } else { rds_connect_path_complete(cp, RDS_CONN_CONNECTING); } break; case TCP_CLOSE_WAIT: case TCP_CLOSE: - rds_conn_path_drop(cp); + rds_conn_path_drop(cp, false); default: break; } diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 0d8616aa5bad..dc860d1bb608 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -157,7 +157,7 @@ out: "returned %d, " "disconnecting and reconnecting\n", &conn->c_faddr, cp->cp_index, ret); - rds_conn_path_drop(cp); + rds_conn_path_drop(cp, false); } } } diff --git a/net/rds/threads.c b/net/rds/threads.c index 2852bc1d37d4..f121daa402c8 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -78,7 +78,7 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr) "current state is %d\n", __func__, atomic_read(&cp->cp_state)); - rds_conn_path_drop(cp); + rds_conn_path_drop(cp, false); return; } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 69b97339ff9d..8c0db9b3e4ab 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -15,7 +15,7 @@ #include <net/netns/generic.h> #include <net/sock.h> #include <net/af_rxrpc.h> -#include <rxrpc/packet.h> +#include "protocol.h" #if 0 #define CHECK_SLAB_OKAY(X) \ diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h new file mode 100644 index 000000000000..4bddcf3face3 --- /dev/null +++ b/net/rxrpc/protocol.h @@ -0,0 +1,190 @@ +/* packet.h: Rx packet layout and definitions + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _LINUX_RXRPC_PACKET_H +#define _LINUX_RXRPC_PACKET_H + +typedef u32 rxrpc_seq_t; /* Rx message sequence number */ +typedef u32 rxrpc_serial_t; /* Rx message serial number */ +typedef __be32 rxrpc_seq_net_t; /* on-the-wire Rx message sequence number */ +typedef __be32 rxrpc_serial_net_t; /* on-the-wire Rx message serial number */ + +/*****************************************************************************/ +/* + * on-the-wire Rx packet header + * - all multibyte fields should be in network byte order + */ +struct rxrpc_wire_header { + __be32 epoch; /* client boot timestamp */ +#define RXRPC_RANDOM_EPOCH 0x80000000 /* Random if set, date-based if not */ + + __be32 cid; /* connection and channel ID */ +#define RXRPC_MAXCALLS 4 /* max active calls per conn */ +#define RXRPC_CHANNELMASK (RXRPC_MAXCALLS-1) /* mask for channel ID */ +#define RXRPC_CIDMASK (~RXRPC_CHANNELMASK) /* mask for connection ID */ +#define RXRPC_CIDSHIFT ilog2(RXRPC_MAXCALLS) /* shift for connection ID */ +#define RXRPC_CID_INC (1 << RXRPC_CIDSHIFT) /* connection ID increment */ + + __be32 callNumber; /* call ID (0 for connection-level packets) */ + __be32 seq; /* sequence number of pkt in call stream */ + __be32 serial; /* serial number of pkt sent to network */ + + uint8_t type; /* packet type */ +#define RXRPC_PACKET_TYPE_DATA 1 /* data */ +#define RXRPC_PACKET_TYPE_ACK 2 /* ACK */ +#define RXRPC_PACKET_TYPE_BUSY 3 /* call reject */ +#define RXRPC_PACKET_TYPE_ABORT 4 /* call/connection abort */ +#define RXRPC_PACKET_TYPE_ACKALL 5 /* ACK all outstanding packets on call */ +#define RXRPC_PACKET_TYPE_CHALLENGE 6 /* connection security challenge (SRVR->CLNT) */ +#define RXRPC_PACKET_TYPE_RESPONSE 7 /* connection secutity response (CLNT->SRVR) */ +#define RXRPC_PACKET_TYPE_DEBUG 8 /* debug info request */ +#define RXRPC_PACKET_TYPE_VERSION 13 /* version string request */ +#define RXRPC_N_PACKET_TYPES 14 /* number of packet types (incl type 0) */ + + uint8_t flags; /* packet flags */ +#define RXRPC_CLIENT_INITIATED 0x01 /* signifies a packet generated by a client */ +#define RXRPC_REQUEST_ACK 0x02 /* request an unconditional ACK of this packet */ +#define RXRPC_LAST_PACKET 0x04 /* the last packet from this side for this call */ +#define RXRPC_MORE_PACKETS 0x08 /* more packets to come */ +#define RXRPC_JUMBO_PACKET 0x20 /* [DATA] this is a jumbo packet */ +#define RXRPC_SLOW_START_OK 0x20 /* [ACK] slow start supported */ + + uint8_t userStatus; /* app-layer defined status */ +#define RXRPC_USERSTATUS_SERVICE_UPGRADE 0x01 /* AuriStor service upgrade request */ + + uint8_t securityIndex; /* security protocol ID */ + union { + __be16 _rsvd; /* reserved */ + __be16 cksum; /* kerberos security checksum */ + }; + __be16 serviceId; /* service ID */ + +} __packed; + +#define RXRPC_SUPPORTED_PACKET_TYPES ( \ + (1 << RXRPC_PACKET_TYPE_DATA) | \ + (1 << RXRPC_PACKET_TYPE_ACK) | \ + (1 << RXRPC_PACKET_TYPE_BUSY) | \ + (1 << RXRPC_PACKET_TYPE_ABORT) | \ + (1 << RXRPC_PACKET_TYPE_ACKALL) | \ + (1 << RXRPC_PACKET_TYPE_CHALLENGE) | \ + (1 << RXRPC_PACKET_TYPE_RESPONSE) | \ + /*(1 << RXRPC_PACKET_TYPE_DEBUG) | */ \ + (1 << RXRPC_PACKET_TYPE_VERSION)) + +/*****************************************************************************/ +/* + * jumbo packet secondary header + * - can be mapped to read header by: + * - new_serial = serial + 1 + * - new_seq = seq + 1 + * - new_flags = j_flags + * - new__rsvd = j__rsvd + * - duplicating all other fields + */ +struct rxrpc_jumbo_header { + uint8_t flags; /* packet flags (as per rxrpc_header) */ + uint8_t pad; + union { + __be16 _rsvd; /* reserved */ + __be16 cksum; /* kerberos security checksum */ + }; +}; + +#define RXRPC_JUMBO_DATALEN 1412 /* non-terminal jumbo packet data length */ +#define RXRPC_JUMBO_SUBPKTLEN (RXRPC_JUMBO_DATALEN + sizeof(struct rxrpc_jumbo_header)) + +/*****************************************************************************/ +/* + * on-the-wire Rx ACK packet data payload + * - all multibyte fields should be in network byte order + */ +struct rxrpc_ackpacket { + __be16 bufferSpace; /* number of packet buffers available */ + __be16 maxSkew; /* diff between serno being ACK'd and highest serial no + * received */ + __be32 firstPacket; /* sequence no of first ACK'd packet in attached list */ + __be32 previousPacket; /* sequence no of previous packet received */ + __be32 serial; /* serial no of packet that prompted this ACK */ + + uint8_t reason; /* reason for ACK */ +#define RXRPC_ACK_REQUESTED 1 /* ACK was requested on packet */ +#define RXRPC_ACK_DUPLICATE 2 /* duplicate packet received */ +#define RXRPC_ACK_OUT_OF_SEQUENCE 3 /* out of sequence packet received */ +#define RXRPC_ACK_EXCEEDS_WINDOW 4 /* packet received beyond end of ACK window */ +#define RXRPC_ACK_NOSPACE 5 /* packet discarded due to lack of buffer space */ +#define RXRPC_ACK_PING 6 /* keep alive ACK */ +#define RXRPC_ACK_PING_RESPONSE 7 /* response to RXRPC_ACK_PING */ +#define RXRPC_ACK_DELAY 8 /* nothing happened since received packet */ +#define RXRPC_ACK_IDLE 9 /* ACK due to fully received ACK window */ +#define RXRPC_ACK__INVALID 10 /* Representation of invalid ACK reason */ + + uint8_t nAcks; /* number of ACKs */ +#define RXRPC_MAXACKS 255 + + uint8_t acks[0]; /* list of ACK/NAKs */ +#define RXRPC_ACK_TYPE_NACK 0 +#define RXRPC_ACK_TYPE_ACK 1 + +} __packed; + +/* Some ACKs refer to specific packets and some are general and can be updated. */ +#define RXRPC_ACK_UPDATEABLE ((1 << RXRPC_ACK_REQUESTED) | \ + (1 << RXRPC_ACK_PING_RESPONSE) | \ + (1 << RXRPC_ACK_DELAY) | \ + (1 << RXRPC_ACK_IDLE)) + + +/* + * ACK packets can have a further piece of information tagged on the end + */ +struct rxrpc_ackinfo { + __be32 rxMTU; /* maximum Rx MTU size (bytes) [AFS 3.3] */ + __be32 maxMTU; /* maximum interface MTU size (bytes) [AFS 3.3] */ + __be32 rwind; /* Rx window size (packets) [AFS 3.4] */ + __be32 jumbo_max; /* max packets to stick into a jumbo packet [AFS 3.5] */ +}; + +/*****************************************************************************/ +/* + * Kerberos security type-2 challenge packet + */ +struct rxkad_challenge { + __be32 version; /* version of this challenge type */ + __be32 nonce; /* encrypted random number */ + __be32 min_level; /* minimum security level */ + __be32 __padding; /* padding to 8-byte boundary */ +} __packed; + +/*****************************************************************************/ +/* + * Kerberos security type-2 response packet + */ +struct rxkad_response { + __be32 version; /* version of this response type */ + __be32 __pad; + + /* encrypted bit of the response */ + struct { + __be32 epoch; /* current epoch */ + __be32 cid; /* parent connection ID */ + __be32 checksum; /* checksum */ + __be32 securityIndex; /* security type */ + __be32 call_id[4]; /* encrypted call IDs */ + __be32 inc_nonce; /* challenge nonce + 1 */ + __be32 level; /* desired level */ + } encrypted; + + __be32 kvno; /* Kerberos key version number */ + __be32 ticket_len; /* Kerberos ticket length */ +} __packed; + +#endif /* _LINUX_RXRPC_PACKET_H */ diff --git a/net/sched/act_api.c b/net/sched/act_api.c index f2e9ed34a963..f19b118df414 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -110,6 +110,8 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, struct netlink_callback *cb) { int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; + u32 act_flags = cb->args[2]; + unsigned long jiffy_since = cb->args[3]; struct nlattr *nest; spin_lock_bh(&hinfo->lock); @@ -127,6 +129,11 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, if (index < s_i) continue; + if (jiffy_since && + time_after(jiffy_since, + (unsigned long)p->tcfa_tm.lastuse)) + continue; + nest = nla_nest_start(skb, n_i); if (nest == NULL) goto nla_put_failure; @@ -138,14 +145,20 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, } nla_nest_end(skb, nest); n_i++; - if (n_i >= TCA_ACT_MAX_PRIO) + if (!(act_flags & TCA_FLAG_LARGE_DUMP_ON) && + n_i >= TCA_ACT_MAX_PRIO) goto done; } } done: + if (index >= 0) + cb->args[0] = index + 1; + spin_unlock_bh(&hinfo->lock); - if (n_i) - cb->args[0] += n_i; + if (n_i) { + if (act_flags & TCA_FLAG_LARGE_DUMP_ON) + cb->args[1] = n_i; + } return n_i; nla_put_failure: @@ -1068,11 +1081,18 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, return tcf_add_notify(net, n, &actions, portid); } +static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; +static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = { + [TCA_ROOT_FLAGS] = { .type = NLA_BITFIELD32, + .validation_data = &tcaa_root_flags_allowed }, + [TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 }, +}; + static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - struct nlattr *tca[TCA_ACT_MAX + 1]; + struct nlattr *tca[TCA_ROOT_MAX + 1]; u32 portid = skb ? NETLINK_CB(skb).portid : 0; int ret = 0, ovr = 0; @@ -1080,7 +1100,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL, + ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL, extack); if (ret < 0) return ret; @@ -1121,16 +1141,12 @@ replay: return ret; } -static struct nlattr *find_dump_kind(const struct nlmsghdr *n) +static struct nlattr *find_dump_kind(struct nlattr **nla) { struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; - struct nlattr *nla[TCAA_MAX + 1]; struct nlattr *kind; - if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX, - NULL, NULL) < 0) - return NULL; tb1 = nla[TCA_ACT_TAB]; if (tb1 == NULL) return NULL; @@ -1157,8 +1173,20 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) struct tc_action_ops *a_o; int ret = 0; struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); - struct nlattr *kind = find_dump_kind(cb->nlh); + struct nlattr *tb[TCA_ROOT_MAX + 1]; + struct nlattr *count_attr = NULL; + unsigned long jiffy_since = 0; + struct nlattr *kind = NULL; + struct nla_bitfield32 bf; + u32 msecs_since = 0; + u32 act_count = 0; + + ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX, + tcaa_policy, NULL); + if (ret < 0) + return ret; + kind = find_dump_kind(tb); if (kind == NULL) { pr_info("tc_dump_action: action bad kind\n"); return 0; @@ -1168,14 +1196,32 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (a_o == NULL) return 0; + cb->args[2] = 0; + if (tb[TCA_ROOT_FLAGS]) { + bf = nla_get_bitfield32(tb[TCA_ROOT_FLAGS]); + cb->args[2] = bf.value; + } + + if (tb[TCA_ROOT_TIME_DELTA]) { + msecs_since = nla_get_u32(tb[TCA_ROOT_TIME_DELTA]); + } + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*t), 0); if (!nlh) goto out_module_put; + + if (msecs_since) + jiffy_since = jiffies - msecs_to_jiffies(msecs_since); + t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; + cb->args[3] = jiffy_since; + count_attr = nla_reserve(skb, TCA_ROOT_COUNT, sizeof(u32)); + if (!count_attr) + goto out_module_put; nest = nla_nest_start(skb, TCA_ACT_TAB); if (nest == NULL) @@ -1188,6 +1234,9 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (ret > 0) { nla_nest_end(skb, nest); ret = skb->len; + act_count = cb->args[1]; + memcpy(nla_data(count_attr), &act_count, sizeof(u32)); + cb->args[1] = 0; } else nlmsg_trim(skb, b); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 3317a2f579da..67afc12df88b 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -231,9 +231,6 @@ static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl, const struct iphdr *iph; u16 ul; - if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) - return 1; - /* * Support both UDP and UDPLITE checksum algorithms, Don't use * udph->len to get the real length without any protocol check, @@ -287,9 +284,6 @@ static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl, const struct ipv6hdr *ip6h; u16 ul; - if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) - return 1; - /* * Support both UDP and UDPLITE checksum algorithms, Don't use * udph->len to get the real length without any protocol check, diff --git a/net/sctp/auth.c b/net/sctp/auth.c index e001b01b0e68..00667c50efa7 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -185,9 +185,9 @@ static int sctp_auth_compare_vectors(struct sctp_auth_bytes *vector1, * are called the two key vectors. */ static struct sctp_auth_bytes *sctp_auth_make_key_vector( - sctp_random_param_t *random, - sctp_chunks_param_t *chunks, - sctp_hmac_algo_param_t *hmacs, + struct sctp_random_param *random, + struct sctp_chunks_param *chunks, + struct sctp_hmac_algo_param *hmacs, gfp_t gfp) { struct sctp_auth_bytes *new; @@ -226,10 +226,9 @@ static struct sctp_auth_bytes *sctp_auth_make_local_vector( gfp_t gfp) { return sctp_auth_make_key_vector( - (sctp_random_param_t *)asoc->c.auth_random, - (sctp_chunks_param_t *)asoc->c.auth_chunks, - (sctp_hmac_algo_param_t *)asoc->c.auth_hmacs, - gfp); + (struct sctp_random_param *)asoc->c.auth_random, + (struct sctp_chunks_param *)asoc->c.auth_chunks, + (struct sctp_hmac_algo_param *)asoc->c.auth_hmacs, gfp); } /* Make a key vector based on peer's parameters */ diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 1323d41e68b8..681b181e7ae3 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -221,7 +221,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, asoc->outqueue.out_qlen == 0 && list_empty(&asoc->outqueue.retransmit) && msg_len > max_data) - first_len -= SCTP_PAD4(sizeof(sctp_sack_chunk_t)); + first_len -= SCTP_PAD4(sizeof(struct sctp_sack_chunk)); /* Encourage Cookie-ECHO bundling. */ if (asoc->state < SCTP_STATE_COOKIE_ECHOED) diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 0e86f988f836..3d506b2f6193 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -73,13 +73,13 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, * variables. There are arrays that we encode directly * into parameters to make the rest of the operations easier. */ - auth_hmacs = kzalloc(sizeof(sctp_hmac_algo_param_t) + - sizeof(__u16) * SCTP_AUTH_NUM_HMACS, gfp); + auth_hmacs = kzalloc(sizeof(*auth_hmacs) + + sizeof(__u16) * SCTP_AUTH_NUM_HMACS, gfp); if (!auth_hmacs) goto nomem; - auth_chunks = kzalloc(sizeof(sctp_chunks_param_t) + - SCTP_NUM_CHUNK_TYPES, gfp); + auth_chunks = kzalloc(sizeof(*auth_chunks) + + SCTP_NUM_CHUNK_TYPES, gfp); if (!auth_chunks) goto nomem; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 2a186b201ad2..107d7c912922 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -497,7 +497,7 @@ static void sctp_v6_from_addr_param(union sctp_addr *addr, static int sctp_v6_to_addr_param(const union sctp_addr *addr, union sctp_addr_param *param) { - int length = sizeof(sctp_ipv6addr_param_t); + int length = sizeof(struct sctp_ipv6addr_param); param->v6.param_hdr.type = SCTP_PARAM_IPV6_ADDRESS; param->v6.param_hdr.length = htons(length); diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index e8762702a313..d2a8adfd4a6f 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1197,7 +1197,7 @@ sctp_flush_out: static void sctp_sack_update_unack_data(struct sctp_association *assoc, struct sctp_sackhdr *sack) { - sctp_sack_variable_t *frags; + union sctp_sack_variable *frags; __u16 unack_data; int i; @@ -1224,7 +1224,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) struct sctp_transport *transport; struct sctp_chunk *tchunk = NULL; struct list_head *lchunk, *transport_list, *temp; - sctp_sack_variable_t *frags = sack->variable; + union sctp_sack_variable *frags = sack->variable; __u32 sack_ctsn, ctsn, tsn; __u32 highest_tsn, highest_new_tsn; __u32 sack_a_rwnd; @@ -1736,10 +1736,10 @@ static void sctp_mark_missing(struct sctp_outq *q, /* Is the given TSN acked by this packet? */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn) { - int i; - sctp_sack_variable_t *frags; - __u16 tsn_offset, blocks; __u32 ctsn = ntohl(sack->cum_tsn_ack); + union sctp_sack_variable *frags; + __u16 tsn_offset, blocks; + int i; if (TSN_lte(tsn, ctsn)) goto pass; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 989a900383b5..852556d67ae3 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -292,7 +292,7 @@ static void sctp_v4_from_addr_param(union sctp_addr *addr, static int sctp_v4_to_addr_param(const union sctp_addr *addr, union sctp_addr_param *param) { - int length = sizeof(sctp_ipv4addr_param_t); + int length = sizeof(struct sctp_ipv4addr_param); param->v4.param_hdr.type = SCTP_PARAM_IPV4_ADDRESS; param->v4.param_hdr.length = htons(length); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 6110447fe51d..163004e7047c 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -69,7 +69,8 @@ static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp); -static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, +static struct sctp_cookie_param *sctp_pack_cookie( + const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *init_chunk, int *cookie_len, @@ -223,10 +224,10 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, struct sctp_chunk *retval = NULL; int num_types, addrs_len = 0; struct sctp_sock *sp; - sctp_supported_addrs_param_t sat; + struct sctp_supported_addrs_param sat; __be16 types[2]; - sctp_adaptation_ind_param_t aiparam; - sctp_supported_ext_param_t ext_param; + struct sctp_adaptation_ind_param aiparam; + struct sctp_supported_ext_param ext_param; int num_ext = 0; __u8 extensions[4]; struct sctp_paramhdr *auth_chunks = NULL, @@ -305,8 +306,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, /* If we have any extensions to report, account for that */ if (num_ext) - chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) + - num_ext); + chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext); /* RFC 2960 3.3.2 Initiation (INIT) (1) * @@ -348,10 +348,8 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, */ if (num_ext) { ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT; - ext_param.param_hdr.length = - htons(sizeof(sctp_supported_ext_param_t) + num_ext); - sctp_addto_chunk(retval, sizeof(sctp_supported_ext_param_t), - &ext_param); + ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext); + sctp_addto_chunk(retval, sizeof(ext_param), &ext_param); sctp_addto_param(retval, num_ext, extensions); } @@ -390,11 +388,11 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, union sctp_params addrs; struct sctp_sock *sp; int addrs_len; - sctp_cookie_param_t *cookie; + struct sctp_cookie_param *cookie; int cookie_len; size_t chunksize; - sctp_adaptation_ind_param_t aiparam; - sctp_supported_ext_param_t ext_param; + struct sctp_adaptation_ind_param aiparam; + struct sctp_supported_ext_param ext_param; int num_ext = 0; __u8 extensions[4]; struct sctp_paramhdr *auth_chunks = NULL, @@ -468,8 +466,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, } if (num_ext) - chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) + - num_ext); + chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext); /* Now allocate and fill out the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp); @@ -495,10 +492,8 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); if (num_ext) { ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT; - ext_param.param_hdr.length = - htons(sizeof(sctp_supported_ext_param_t) + num_ext); - sctp_addto_chunk(retval, sizeof(sctp_supported_ext_param_t), - &ext_param); + ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext); + sctp_addto_chunk(retval, sizeof(ext_param), &ext_param); sctp_addto_param(retval, num_ext, extensions); } if (asoc->peer.prsctp_capable) @@ -1601,14 +1596,15 @@ nodata: /* Build a cookie representing asoc. * This INCLUDES the param header needed to put the cookie in the INIT ACK. */ -static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const struct sctp_chunk *init_chunk, - int *cookie_len, - const __u8 *raw_addrs, int addrs_len) +static struct sctp_cookie_param *sctp_pack_cookie( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const struct sctp_chunk *init_chunk, + int *cookie_len, + const __u8 *raw_addrs, int addrs_len) { - sctp_cookie_param_t *retval; struct sctp_signed_cookie *cookie; + struct sctp_cookie_param *retval; int headersize, bodysize; /* Header size is static data prior to the actual cookie, including @@ -3153,7 +3149,7 @@ bool sctp_verify_asconf(const struct sctp_association *asoc, case SCTP_PARAM_ERR_CAUSE: break; case SCTP_PARAM_IPV4_ADDRESS: - if (length != sizeof(sctp_ipv4addr_param_t)) + if (length != sizeof(struct sctp_ipv4addr_param)) return false; /* ensure there is only one addr param and it's in the * beginning of addip_hdr params, or we reject it. @@ -3163,7 +3159,7 @@ bool sctp_verify_asconf(const struct sctp_association *asoc, addr_param_seen = true; break; case SCTP_PARAM_IPV6_ADDRESS: - if (length != sizeof(sctp_ipv6addr_param_t)) + if (length != sizeof(struct sctp_ipv6addr_param)) return false; if (param.v != addip->addip_hdr.params) return false; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index b2a74c3823ee..dc0c2c4188d8 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -306,12 +306,10 @@ sctp_disposition_t sctp_sf_do_5_1B_init(struct net *net, void *arg, sctp_cmd_seq_t *commands) { - struct sctp_chunk *chunk = arg; - struct sctp_chunk *repl; + struct sctp_chunk *chunk = arg, *repl, *err_chunk; + struct sctp_unrecognized_param *unk_param; struct sctp_association *new_asoc; - struct sctp_chunk *err_chunk; struct sctp_packet *packet; - sctp_unrecognized_param_t *unk_param; int len; /* 6.10 Bundling @@ -435,7 +433,7 @@ sctp_disposition_t sctp_sf_do_5_1B_init(struct net *net, * construct the parameters in INIT ACK by copying the * ERROR causes over. */ - unk_param = (sctp_unrecognized_param_t *) + unk_param = (struct sctp_unrecognized_param *) ((__u8 *)(err_chunk->chunk_hdr) + sizeof(struct sctp_chunkhdr)); /* Replace the cause code with the "Unrecognized parameter" @@ -518,7 +516,7 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(struct net *net, return sctp_sf_violation_chunk(net, ep, asoc, type, arg, commands); /* Make sure that the INIT-ACK chunk has a valid length */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_initack_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_initack_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); /* Grab the INIT header. */ @@ -1090,7 +1088,8 @@ sctp_disposition_t sctp_sf_beat_8_3(struct net *net, return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* Make sure that the HEARTBEAT chunk has a valid length. */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_heartbeat_chunk_t))) + if (!sctp_chunk_length_valid(chunk, + sizeof(struct sctp_heartbeat_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -1098,7 +1097,7 @@ sctp_disposition_t sctp_sf_beat_8_3(struct net *net, * respond with a HEARTBEAT ACK that contains the Heartbeat * Information field copied from the received HEARTBEAT chunk. */ - chunk->subh.hb_hdr = (sctp_heartbeathdr_t *)chunk->skb->data; + chunk->subh.hb_hdr = (struct sctp_heartbeathdr *)chunk->skb->data; param_hdr = (struct sctp_paramhdr *)chunk->subh.hb_hdr; paylen = ntohs(chunk->chunk_hdr->length) - sizeof(struct sctp_chunkhdr); @@ -1419,13 +1418,11 @@ static sctp_disposition_t sctp_sf_do_unexpected_init( const sctp_subtype_t type, void *arg, sctp_cmd_seq_t *commands) { - sctp_disposition_t retval; - struct sctp_chunk *chunk = arg; - struct sctp_chunk *repl; + struct sctp_chunk *chunk = arg, *repl, *err_chunk; + struct sctp_unrecognized_param *unk_param; struct sctp_association *new_asoc; - struct sctp_chunk *err_chunk; struct sctp_packet *packet; - sctp_unrecognized_param_t *unk_param; + sctp_disposition_t retval; int len; /* 6.10 Bundling @@ -1555,7 +1552,7 @@ static sctp_disposition_t sctp_sf_do_unexpected_init( * construct the parameters in INIT ACK by copying the * ERROR causes over. */ - unk_param = (sctp_unrecognized_param_t *) + unk_param = (struct sctp_unrecognized_param *) ((__u8 *)(err_chunk->chunk_hdr) + sizeof(struct sctp_chunkhdr)); /* Replace the cause code with the "Unrecognized parameter" @@ -2167,7 +2164,7 @@ sctp_disposition_t sctp_sf_shutdown_pending_abort( * as we do not know its true length. So, to be safe, discard the * packet. */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_abort_chunk))) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* ADD-IP: Special case for ABORT chunks @@ -2209,7 +2206,7 @@ sctp_disposition_t sctp_sf_shutdown_sent_abort(struct net *net, * as we do not know its true length. So, to be safe, discard the * packet. */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_abort_chunk))) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* ADD-IP: Special case for ABORT chunks @@ -2336,13 +2333,12 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net, void *arg, sctp_cmd_seq_t *commands) { - struct sctp_chunk *chunk = arg; - u32 stale; - sctp_cookie_preserve_param_t bht; - sctp_errhdr_t *err; - struct sctp_chunk *reply; - struct sctp_bind_addr *bp; int attempts = asoc->init_err_counter + 1; + struct sctp_chunk *chunk = arg, *reply; + struct sctp_cookie_preserve_param bht; + struct sctp_bind_addr *bp; + sctp_errhdr_t *err; + u32 stale; if (attempts > asoc->max_init_attempts) { sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, @@ -2474,7 +2470,7 @@ sctp_disposition_t sctp_sf_do_9_1_abort(struct net *net, * as we do not know its true length. So, to be safe, discard the * packet. */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_abort_chunk))) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* ADD-IP: Special case for ABORT chunks @@ -2550,7 +2546,7 @@ sctp_disposition_t sctp_sf_cookie_wait_abort(struct net *net, * as we do not know its true length. So, to be safe, discard the * packet. */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_abort_chunk))) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* See if we have an error cause code in the chunk. */ @@ -3192,14 +3188,14 @@ sctp_disposition_t sctp_sf_eat_sack_6_2(struct net *net, sctp_cmd_seq_t *commands) { struct sctp_chunk *chunk = arg; - sctp_sackhdr_t *sackh; + struct sctp_sackhdr *sackh; __u32 ctsn; if (!sctp_vtag_verify(chunk, asoc)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* Make sure that the SACK chunk has a valid length. */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_sack_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_sack_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -4454,11 +4450,10 @@ static sctp_disposition_t sctp_sf_abort_violation( /* Treat INIT-ACK as a special case during COOKIE-WAIT. */ if (chunk->chunk_hdr->type == SCTP_CID_INIT_ACK && !asoc->peer.i.init_tag) { - sctp_initack_chunk_t *initack; + struct sctp_initack_chunk *initack; - initack = (sctp_initack_chunk_t *)chunk->chunk_hdr; - if (!sctp_chunk_length_valid(chunk, - sizeof(sctp_initack_chunk_t))) + initack = (struct sctp_initack_chunk *)chunk->chunk_hdr; + if (!sctp_chunk_length_valid(chunk, sizeof(*initack))) abort->chunk_hdr->flags |= SCTP_CHUNK_FLAG_T; else { unsigned int inittag; @@ -4521,7 +4516,7 @@ nomem: * Handle a protocol violation when the chunk length is invalid. * "Invalid" length is identified as smaller than the minimal length a * given chunk can be. For example, a SACK chunk has invalid length - * if its length is set to be smaller than the size of sctp_sack_chunk_t. + * if its length is set to be smaller than the size of struct sctp_sack_chunk. * * We inform the other end by sending an ABORT with a Protocol Violation * error code. @@ -6107,9 +6102,9 @@ static struct sctp_packet *sctp_ootb_pkt_new(struct net *net, switch (chunk->chunk_hdr->type) { case SCTP_CID_INIT_ACK: { - sctp_initack_chunk_t *initack; + struct sctp_initack_chunk *initack; - initack = (sctp_initack_chunk_t *)chunk->chunk_hdr; + initack = (struct sctp_initack_chunk *)chunk->chunk_hdr; vtag = ntohl(initack->init_hdr.init_tag); break; } diff --git a/net/smc/Kconfig b/net/smc/Kconfig index 33954852f3f8..c717ef0896aa 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -8,10 +8,6 @@ config SMC The Linux implementation of the SMC-R solution is designed as a separate socket family SMC. - Warning: SMC will expose all memory for remote reads and writes - once a connection is established. Don't enable this option except - for tightly controlled lab environment. - Select this option if you want to run SMC socket applications config SMC_DIAG diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6793d7348cc8..8c6d24b2995d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -338,6 +338,12 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid) return SMC_CLC_DECL_INTERR; smc_wr_remember_qp_attr(link); + + rc = smc_wr_reg_send(link, + smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]); + if (rc) + return SMC_CLC_DECL_INTERR; + /* send CONFIRM LINK response over RoCE fabric */ rc = smc_llc_send_confirm_link(link, link->smcibdev->mac[link->ibport - 1], @@ -430,12 +436,8 @@ static int smc_connect_rdma(struct smc_sock *smc) smc_conn_save_peer_info(smc, &aclc); - rc = smc_sndbuf_create(smc); - if (rc) { - reason_code = SMC_CLC_DECL_MEM; - goto decline_rdma_unlock; - } - rc = smc_rmb_create(smc); + /* create send buffer and rmb */ + rc = smc_buf_create(smc); if (rc) { reason_code = SMC_CLC_DECL_MEM; goto decline_rdma_unlock; @@ -459,7 +461,20 @@ static int smc_connect_rdma(struct smc_sock *smc) reason_code = SMC_CLC_DECL_INTERR; goto decline_rdma_unlock; } + } else { + struct smc_buf_desc *buf_desc = smc->conn.rmb_desc; + + if (!buf_desc->reused) { + /* register memory region for new rmb */ + rc = smc_wr_reg_send(link, + buf_desc->mr_rx[SMC_SINGLE_LINK]); + if (rc) { + reason_code = SMC_CLC_DECL_INTERR; + goto decline_rdma_unlock; + } + } } + smc_rmb_sync_sg_for_device(&smc->conn); rc = smc_clc_send_confirm(smc); if (rc) @@ -692,6 +707,12 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) int rc; link = &lgr->lnk[SMC_SINGLE_LINK]; + + rc = smc_wr_reg_send(link, + smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]); + if (rc) + return SMC_CLC_DECL_INTERR; + /* send CONFIRM LINK request to client over the RoCE fabric */ rc = smc_llc_send_confirm_link(link, link->smcibdev->mac[link->ibport - 1], @@ -779,11 +800,6 @@ static void smc_listen_work(struct work_struct *work) mutex_lock(&smc_create_lgr_pending); local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr, smcibdev, ibport, &pclc.lcl, 0); - if (local_contact == SMC_REUSE_CONTACT) - /* lock no longer needed, free it due to following - * smc_clc_wait_msg() call - */ - mutex_unlock(&smc_create_lgr_pending); if (local_contact < 0) { rc = local_contact; if (rc == -ENOMEM) @@ -794,12 +810,8 @@ static void smc_listen_work(struct work_struct *work) } link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; - rc = smc_sndbuf_create(new_smc); - if (rc) { - reason_code = SMC_CLC_DECL_MEM; - goto decline_rdma; - } - rc = smc_rmb_create(new_smc); + /* create send buffer and rmb */ + rc = smc_buf_create(new_smc); if (rc) { reason_code = SMC_CLC_DECL_MEM; goto decline_rdma; @@ -808,6 +820,21 @@ static void smc_listen_work(struct work_struct *work) smc_close_init(new_smc); smc_rx_init(new_smc); + if (local_contact != SMC_FIRST_CONTACT) { + struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc; + + if (!buf_desc->reused) { + /* register memory region for new rmb */ + rc = smc_wr_reg_send(link, + buf_desc->mr_rx[SMC_SINGLE_LINK]); + if (rc) { + reason_code = SMC_CLC_DECL_INTERR; + goto decline_rdma; + } + } + } + smc_rmb_sync_sg_for_device(&new_smc->conn); + rc = smc_clc_send_accept(new_smc, local_contact); if (rc) goto out_err; @@ -853,8 +880,7 @@ out_connected: if (newsmcsk->sk_state == SMC_INIT) newsmcsk->sk_state = SMC_ACTIVE; enqueue: - if (local_contact == SMC_FIRST_CONTACT) - mutex_unlock(&smc_create_lgr_pending); + mutex_unlock(&smc_create_lgr_pending); lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); if (lsmc->sk.sk_state == SMC_LISTEN) { smc_accept_enqueue(&lsmc->sk, newsmcsk); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 03ec058d18df..3934913ab835 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -204,13 +204,13 @@ int smc_clc_send_confirm(struct smc_sock *smc) memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN); hton24(cclc.qpn, link->roce_qp->qp_num); cclc.rmb_rkey = - htonl(conn->rmb_desc->rkey[SMC_SINGLE_LINK]); + htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */ cclc.rmbe_alert_token = htonl(conn->alert_token_local); cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); cclc.rmbe_size = conn->rmbe_size_short; - cclc.rmb_dma_addr = - cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]); + cclc.rmb_dma_addr = cpu_to_be64( + (u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); hton24(cclc.psn, link->psn_initial); memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); @@ -256,13 +256,13 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN); hton24(aclc.qpn, link->roce_qp->qp_num); aclc.rmb_rkey = - htonl(conn->rmb_desc->rkey[SMC_SINGLE_LINK]); + htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */ aclc.rmbe_alert_token = htonl(conn->alert_token_local); aclc.qp_mtu = link->path_mtu; aclc.rmbe_size = conn->rmbe_size_short, - aclc.rmb_dma_addr = - cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]); + aclc.rmb_dma_addr = cpu_to_be64( + (u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); hton24(aclc.psn, link->psn_initial); memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3ac09a629ea1..1a16d51e2330 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -175,7 +175,6 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr, rc = smc_wr_alloc_link_mem(lnk); if (rc) goto free_lgr; - init_waitqueue_head(&lnk->wr_tx_wait); rc = smc_ib_create_protection_domain(lnk); if (rc) goto free_link_mem; @@ -207,17 +206,14 @@ out: return rc; } -static void smc_sndbuf_unuse(struct smc_connection *conn) +static void smc_buf_unuse(struct smc_connection *conn) { if (conn->sndbuf_desc) { conn->sndbuf_desc->used = 0; conn->sndbuf_size = 0; } -} - -static void smc_rmb_unuse(struct smc_connection *conn) -{ if (conn->rmb_desc) { + conn->rmb_desc->reused = true; conn->rmb_desc->used = 0; conn->rmbe_size = 0; } @@ -232,8 +228,7 @@ void smc_conn_free(struct smc_connection *conn) return; smc_cdc_tx_dismiss_slots(conn); smc_lgr_unregister_conn(conn); - smc_rmb_unuse(conn); - smc_sndbuf_unuse(conn); + smc_buf_unuse(conn); } static void smc_link_clear(struct smc_link *lnk) @@ -246,48 +241,57 @@ static void smc_link_clear(struct smc_link *lnk) smc_wr_free_link_mem(lnk); } -static void smc_lgr_free_sndbufs(struct smc_link_group *lgr) +static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk, + bool is_rmb) { - struct smc_buf_desc *sndbuf_desc, *bf_desc; - int i; - - for (i = 0; i < SMC_RMBE_SIZES; i++) { - list_for_each_entry_safe(sndbuf_desc, bf_desc, &lgr->sndbufs[i], - list) { - list_del(&sndbuf_desc->list); - smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - smc_uncompress_bufsize(i), - sndbuf_desc, DMA_TO_DEVICE); - kfree(sndbuf_desc->cpu_addr); - kfree(sndbuf_desc); - } + if (is_rmb) { + if (buf_desc->mr_rx[SMC_SINGLE_LINK]) + smc_ib_put_memory_region( + buf_desc->mr_rx[SMC_SINGLE_LINK]); + smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, + DMA_FROM_DEVICE); + } else { + smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, + DMA_TO_DEVICE); } + sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]); + if (buf_desc->cpu_addr) + free_pages((unsigned long)buf_desc->cpu_addr, buf_desc->order); + kfree(buf_desc); } -static void smc_lgr_free_rmbs(struct smc_link_group *lgr) +static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) { - struct smc_buf_desc *rmb_desc, *bf_desc; struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + struct smc_buf_desc *buf_desc, *bf_desc; + struct list_head *buf_list; int i; for (i = 0; i < SMC_RMBE_SIZES; i++) { - list_for_each_entry_safe(rmb_desc, bf_desc, &lgr->rmbs[i], + if (is_rmb) + buf_list = &lgr->rmbs[i]; + else + buf_list = &lgr->sndbufs[i]; + list_for_each_entry_safe(buf_desc, bf_desc, buf_list, list) { - list_del(&rmb_desc->list); - smc_ib_buf_unmap(lnk->smcibdev, - smc_uncompress_bufsize(i), - rmb_desc, DMA_FROM_DEVICE); - kfree(rmb_desc->cpu_addr); - kfree(rmb_desc); + list_del(&buf_desc->list); + smc_buf_free(buf_desc, lnk, is_rmb); } } } +static void smc_lgr_free_bufs(struct smc_link_group *lgr) +{ + /* free send buffers */ + __smc_lgr_free_bufs(lgr, false); + /* free rmbs */ + __smc_lgr_free_bufs(lgr, true); +} + /* remove a link group */ void smc_lgr_free(struct smc_link_group *lgr) { - smc_lgr_free_rmbs(lgr); - smc_lgr_free_sndbufs(lgr); + smc_lgr_free_bufs(lgr); smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); kfree(lgr); } @@ -452,45 +456,25 @@ out: return rc ? rc : local_contact; } -/* try to reuse a sndbuf description slot of the sndbufs list for a certain - * buf_size; if not available, return NULL +/* try to reuse a sndbuf or rmb description slot for a certain + * buffer size; if not available, return NULL */ static inline -struct smc_buf_desc *smc_sndbuf_get_slot(struct smc_link_group *lgr, - int compressed_bufsize) +struct smc_buf_desc *smc_buf_get_slot(struct smc_link_group *lgr, + int compressed_bufsize, + rwlock_t *lock, + struct list_head *buf_list) { - struct smc_buf_desc *sndbuf_slot; - - read_lock_bh(&lgr->sndbufs_lock); - list_for_each_entry(sndbuf_slot, &lgr->sndbufs[compressed_bufsize], - list) { - if (cmpxchg(&sndbuf_slot->used, 0, 1) == 0) { - read_unlock_bh(&lgr->sndbufs_lock); - return sndbuf_slot; - } - } - read_unlock_bh(&lgr->sndbufs_lock); - return NULL; -} + struct smc_buf_desc *buf_slot; -/* try to reuse an rmb description slot of the rmbs list for a certain - * rmbe_size; if not available, return NULL - */ -static inline -struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr, - int compressed_bufsize) -{ - struct smc_buf_desc *rmb_slot; - - read_lock_bh(&lgr->rmbs_lock); - list_for_each_entry(rmb_slot, &lgr->rmbs[compressed_bufsize], - list) { - if (cmpxchg(&rmb_slot->used, 0, 1) == 0) { - read_unlock_bh(&lgr->rmbs_lock); - return rmb_slot; + read_lock_bh(lock); + list_for_each_entry(buf_slot, buf_list, list) { + if (cmpxchg(&buf_slot->used, 0, 1) == 0) { + read_unlock_bh(lock); + return buf_slot; } } - read_unlock_bh(&lgr->rmbs_lock); + read_unlock_bh(lock); return NULL; } @@ -503,136 +487,186 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size) return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } -/* create the tx buffer for an SMC socket */ -int smc_sndbuf_create(struct smc_sock *smc) +static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, + bool is_rmb, int bufsize) { - struct smc_connection *conn = &smc->conn; - struct smc_link_group *lgr = conn->lgr; - int tmp_bufsize, tmp_bufsize_short; - struct smc_buf_desc *sndbuf_desc; + struct smc_buf_desc *buf_desc; + struct smc_link *lnk; int rc; - /* use socket send buffer size (w/o overhead) as start value */ - for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2); - tmp_bufsize_short >= 0; tmp_bufsize_short--) { - tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short); - /* check for reusable sndbuf_slot in the link group */ - sndbuf_desc = smc_sndbuf_get_slot(lgr, tmp_bufsize_short); - if (sndbuf_desc) { - memset(sndbuf_desc->cpu_addr, 0, tmp_bufsize); - break; /* found reusable slot */ - } - /* try to alloc a new send buffer */ - sndbuf_desc = kzalloc(sizeof(*sndbuf_desc), GFP_KERNEL); - if (!sndbuf_desc) - break; /* give up with -ENOMEM */ - sndbuf_desc->cpu_addr = kzalloc(tmp_bufsize, - GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | - __GFP_NORETRY); - if (!sndbuf_desc->cpu_addr) { - kfree(sndbuf_desc); - sndbuf_desc = NULL; - /* if send buffer allocation has failed, - * try a smaller one - */ - continue; - } - rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - tmp_bufsize, sndbuf_desc, - DMA_TO_DEVICE); + /* try to alloc a new buffer */ + buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); + if (!buf_desc) + return ERR_PTR(-ENOMEM); + + buf_desc->cpu_addr = + (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | + __GFP_NORETRY | __GFP_ZERO, + get_order(bufsize)); + if (!buf_desc->cpu_addr) { + kfree(buf_desc); + return ERR_PTR(-EAGAIN); + } + buf_desc->order = get_order(bufsize); + + /* build the sg table from the pages */ + lnk = &lgr->lnk[SMC_SINGLE_LINK]; + rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1, + GFP_KERNEL); + if (rc) { + smc_buf_free(buf_desc, lnk, is_rmb); + return ERR_PTR(rc); + } + sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl, + buf_desc->cpu_addr, bufsize); + + /* map sg table to DMA address */ + rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + /* SMC protocol depends on mapping to one DMA address only */ + if (rc != 1) { + smc_buf_free(buf_desc, lnk, is_rmb); + return ERR_PTR(-EAGAIN); + } + + /* create a new memory region for the RMB */ + if (is_rmb) { + rc = smc_ib_get_memory_region(lnk->roce_pd, + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_LOCAL_WRITE, + buf_desc); if (rc) { - kfree(sndbuf_desc->cpu_addr); - kfree(sndbuf_desc); - sndbuf_desc = NULL; - continue; /* if mapping failed, try smaller one */ + smc_buf_free(buf_desc, lnk, is_rmb); + return ERR_PTR(rc); } - sndbuf_desc->used = 1; - write_lock_bh(&lgr->sndbufs_lock); - list_add(&sndbuf_desc->list, - &lgr->sndbufs[tmp_bufsize_short]); - write_unlock_bh(&lgr->sndbufs_lock); - break; - } - if (sndbuf_desc && sndbuf_desc->cpu_addr) { - conn->sndbuf_desc = sndbuf_desc; - conn->sndbuf_size = tmp_bufsize; - smc->sk.sk_sndbuf = tmp_bufsize * 2; - atomic_set(&conn->sndbuf_space, tmp_bufsize); - return 0; - } else { - return -ENOMEM; } + + return buf_desc; } -/* create the RMB for an SMC socket (even though the SMC protocol - * allows more than one RMB-element per RMB, the Linux implementation - * uses just one RMB-element per RMB, i.e. uses an extra RMB for every - * connection in a link group - */ -int smc_rmb_create(struct smc_sock *smc) +static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) { struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; - int tmp_bufsize, tmp_bufsize_short; - struct smc_buf_desc *rmb_desc; - int rc; + struct smc_buf_desc *buf_desc = NULL; + struct list_head *buf_list; + int bufsize, bufsize_short; + int sk_buf_size; + rwlock_t *lock; + + if (is_rmb) + /* use socket recv buffer size (w/o overhead) as start value */ + sk_buf_size = smc->sk.sk_rcvbuf / 2; + else + /* use socket send buffer size (w/o overhead) as start value */ + sk_buf_size = smc->sk.sk_sndbuf / 2; + + for (bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2); + bufsize_short >= 0; bufsize_short--) { + + if (is_rmb) { + lock = &lgr->rmbs_lock; + buf_list = &lgr->rmbs[bufsize_short]; + } else { + lock = &lgr->sndbufs_lock; + buf_list = &lgr->sndbufs[bufsize_short]; + } + bufsize = smc_uncompress_bufsize(bufsize_short); + if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC) + continue; - /* use socket recv buffer size (w/o overhead) as start value */ - for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_rcvbuf / 2); - tmp_bufsize_short >= 0; tmp_bufsize_short--) { - tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short); - /* check for reusable rmb_slot in the link group */ - rmb_desc = smc_rmb_get_slot(lgr, tmp_bufsize_short); - if (rmb_desc) { - memset(rmb_desc->cpu_addr, 0, tmp_bufsize); + /* check for reusable slot in the link group */ + buf_desc = smc_buf_get_slot(lgr, bufsize_short, lock, buf_list); + if (buf_desc) { + memset(buf_desc->cpu_addr, 0, bufsize); break; /* found reusable slot */ } - /* try to alloc a new RMB */ - rmb_desc = kzalloc(sizeof(*rmb_desc), GFP_KERNEL); - if (!rmb_desc) - break; /* give up with -ENOMEM */ - rmb_desc->cpu_addr = kzalloc(tmp_bufsize, - GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | - __GFP_NORETRY); - if (!rmb_desc->cpu_addr) { - kfree(rmb_desc); - rmb_desc = NULL; - /* if RMB allocation has failed, - * try a smaller one - */ + + buf_desc = smc_new_buf_create(lgr, is_rmb, bufsize); + if (PTR_ERR(buf_desc) == -ENOMEM) + break; + if (IS_ERR(buf_desc)) continue; - } - rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - tmp_bufsize, rmb_desc, - DMA_FROM_DEVICE); - if (rc) { - kfree(rmb_desc->cpu_addr); - kfree(rmb_desc); - rmb_desc = NULL; - continue; /* if mapping failed, try smaller one */ - } - rmb_desc->rkey[SMC_SINGLE_LINK] = - lgr->lnk[SMC_SINGLE_LINK].roce_pd->unsafe_global_rkey; - rmb_desc->used = 1; - write_lock_bh(&lgr->rmbs_lock); - list_add(&rmb_desc->list, - &lgr->rmbs[tmp_bufsize_short]); - write_unlock_bh(&lgr->rmbs_lock); - break; + + buf_desc->used = 1; + write_lock_bh(lock); + list_add(&buf_desc->list, buf_list); + write_unlock_bh(lock); + break; /* found */ } - if (rmb_desc && rmb_desc->cpu_addr) { - conn->rmb_desc = rmb_desc; - conn->rmbe_size = tmp_bufsize; - conn->rmbe_size_short = tmp_bufsize_short; - smc->sk.sk_rcvbuf = tmp_bufsize * 2; + + if (IS_ERR(buf_desc)) + return -ENOMEM; + + if (is_rmb) { + conn->rmb_desc = buf_desc; + conn->rmbe_size = bufsize; + conn->rmbe_size_short = bufsize_short; + smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); - conn->rmbe_update_limit = smc_rmb_wnd_update_limit(tmp_bufsize); - return 0; + conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize); } else { - return -ENOMEM; + conn->sndbuf_desc = buf_desc; + conn->sndbuf_size = bufsize; + smc->sk.sk_sndbuf = bufsize * 2; + atomic_set(&conn->sndbuf_space, bufsize); } + return 0; +} + +void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) +{ + struct smc_link_group *lgr = conn->lgr; + + smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + conn->sndbuf_desc, DMA_TO_DEVICE); +} + +void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) +{ + struct smc_link_group *lgr = conn->lgr; + + smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + conn->sndbuf_desc, DMA_TO_DEVICE); +} + +void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) +{ + struct smc_link_group *lgr = conn->lgr; + + smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + conn->rmb_desc, DMA_FROM_DEVICE); +} + +void smc_rmb_sync_sg_for_device(struct smc_connection *conn) +{ + struct smc_link_group *lgr = conn->lgr; + + smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + conn->rmb_desc, DMA_FROM_DEVICE); +} + +/* create the send and receive buffer for an SMC socket; + * receive buffers are called RMBs; + * (even though the SMC protocol allows more than one RMB-element per RMB, + * the Linux implementation uses just one RMB-element per RMB, i.e. uses an + * extra RMB for every connection in a link group + */ +int smc_buf_create(struct smc_sock *smc) +{ + int rc; + + /* create send buffer */ + rc = __smc_buf_create(smc, false); + if (rc) + return rc; + /* create rmb */ + rc = __smc_buf_create(smc, true); + if (rc) + smc_buf_free(smc->conn.sndbuf_desc, + &smc->conn.lgr->lnk[SMC_SINGLE_LINK], false); + return rc; } static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index b013cb43a327..19c44bf4e391 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -37,6 +37,14 @@ struct smc_wr_buf { u8 raw[SMC_WR_BUF_SIZE]; }; +#define SMC_WR_REG_MR_WAIT_TIME (5 * HZ)/* wait time for ib_wr_reg_mr result */ + +enum smc_wr_reg_state { + POSTED, /* ib_wr_reg_mr request posted */ + CONFIRMED, /* ib_wr_reg_mr response: successful */ + FAILED /* ib_wr_reg_mr response: failure */ +}; + struct smc_link { struct smc_ib_device *smcibdev; /* ib-device */ u8 ibport; /* port - values 1 | 2 */ @@ -65,6 +73,10 @@ struct smc_link { u64 wr_rx_id; /* seq # of last recv WR */ u32 wr_rx_cnt; /* number of WR recv buffers */ + struct ib_reg_wr wr_reg; /* WR register memory region */ + wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ + enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ + union ib_gid gid; /* gid matching used vlan id */ u32 peer_qpn; /* QP number of peer */ enum ib_mtu path_mtu; /* used mtu */ @@ -90,14 +102,15 @@ struct smc_link { /* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */ struct smc_buf_desc { struct list_head list; - u64 dma_addr[SMC_LINKS_PER_LGR_MAX]; - /* mapped address of buffer */ void *cpu_addr; /* virtual address of buffer */ - u32 rkey[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: - * rkey provided to peer + struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */ + struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; + /* for rmb only: memory region + * incl. rkey provided to peer */ + u32 order; /* allocation order */ u32 used; /* currently used / unused */ + bool reused; /* new created / reused */ }; struct smc_rtoken { /* address/key of remote RMB */ @@ -173,9 +186,11 @@ struct smc_clc_msg_accept_confirm; void smc_lgr_free(struct smc_link_group *lgr); void smc_lgr_terminate(struct smc_link_group *lgr); -int smc_sndbuf_create(struct smc_sock *smc); -int smc_rmb_create(struct smc_sock *smc); +int smc_buf_create(struct smc_sock *smc); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc); - +void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); +void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); +void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); +void smc_rmb_sync_sg_for_device(struct smc_connection *conn); #endif diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index b31715505a35..547e0e113b17 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -13,6 +13,7 @@ #include <linux/random.h> #include <linux/workqueue.h> +#include <linux/scatterlist.h> #include <rdma/ib_verbs.h> #include "smc_pnet.h" @@ -192,8 +193,7 @@ int smc_ib_create_protection_domain(struct smc_link *lnk) { int rc; - lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, - IB_PD_UNSAFE_GLOBAL_RKEY); + lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0); rc = PTR_ERR_OR_ZERO(lnk->roce_pd); if (IS_ERR(lnk->roce_pd)) lnk->roce_pd = NULL; @@ -232,10 +232,10 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .recv_cq = lnk->smcibdev->roce_cq_recv, .srq = NULL, .cap = { - .max_send_wr = SMC_WR_BUF_CNT, /* include unsolicited rdma_writes as well, * there are max. 2 RDMA_WRITE per 1 WR_SEND */ + .max_send_wr = SMC_WR_BUF_CNT * 3, .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, .max_recv_sge = 1, @@ -254,33 +254,117 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) return rc; } -/* map a new TX or RX buffer to DMA */ -int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size, - struct smc_buf_desc *buf_slot, - enum dma_data_direction data_direction) +void smc_ib_put_memory_region(struct ib_mr *mr) { - int rc = 0; + ib_dereg_mr(mr); +} - if (buf_slot->dma_addr[SMC_SINGLE_LINK]) - return rc; /* already mapped */ - buf_slot->dma_addr[SMC_SINGLE_LINK] = - ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr, - buf_size, data_direction); - if (ib_dma_mapping_error(smcibdev->ibdev, - buf_slot->dma_addr[SMC_SINGLE_LINK])) - rc = -EIO; - return rc; +static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot) +{ + unsigned int offset = 0; + int sg_num; + + /* map the largest prefix of a dma mapped SG list */ + sg_num = ib_map_mr_sg(buf_slot->mr_rx[SMC_SINGLE_LINK], + buf_slot->sgt[SMC_SINGLE_LINK].sgl, + buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + &offset, PAGE_SIZE); + + return sg_num; +} + +/* Allocate a memory region and map the dma mapped SG list of buf_slot */ +int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, + struct smc_buf_desc *buf_slot) +{ + if (buf_slot->mr_rx[SMC_SINGLE_LINK]) + return 0; /* already done */ + + buf_slot->mr_rx[SMC_SINGLE_LINK] = + ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order); + if (IS_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK])) { + int rc; + + rc = PTR_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK]); + buf_slot->mr_rx[SMC_SINGLE_LINK] = NULL; + return rc; + } + + if (smc_ib_map_mr_sg(buf_slot) != 1) + return -EINVAL; + + return 0; } -void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int buf_size, +/* synchronize buffer usage for cpu access */ +void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction) +{ + struct scatterlist *sg; + unsigned int i; + + /* for now there is just one DMA address */ + for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg, + buf_slot->sgt[SMC_SINGLE_LINK].nents, i) { + if (!sg_dma_len(sg)) + break; + ib_dma_sync_single_for_cpu(smcibdev->ibdev, + sg_dma_address(sg), + sg_dma_len(sg), + data_direction); + } +} + +/* synchronize buffer usage for device access */ +void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction) +{ + struct scatterlist *sg; + unsigned int i; + + /* for now there is just one DMA address */ + for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg, + buf_slot->sgt[SMC_SINGLE_LINK].nents, i) { + if (!sg_dma_len(sg)) + break; + ib_dma_sync_single_for_device(smcibdev->ibdev, + sg_dma_address(sg), + sg_dma_len(sg), + data_direction); + } +} + +/* Map a new TX or RX buffer SG-table to DMA */ +int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { - if (!buf_slot->dma_addr[SMC_SINGLE_LINK]) + int mapped_nents; + + mapped_nents = ib_dma_map_sg(smcibdev->ibdev, + buf_slot->sgt[SMC_SINGLE_LINK].sgl, + buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + data_direction); + if (!mapped_nents) + return -ENOMEM; + + return mapped_nents; +} + +void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction) +{ + if (!buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address) return; /* already unmapped */ - ib_dma_unmap_single(smcibdev->ibdev, *buf_slot->dma_addr, buf_size, - data_direction); - buf_slot->dma_addr[SMC_SINGLE_LINK] = 0; + + ib_dma_unmap_sg(smcibdev->ibdev, + buf_slot->sgt[SMC_SINGLE_LINK].sgl, + buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + data_direction); + buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0; } static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index b567152a526d..9b927a33d5e6 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -51,12 +51,12 @@ int smc_ib_register_client(void) __init; void smc_ib_unregister_client(void); bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport); -int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size, - struct smc_buf_desc *buf_slot, - enum dma_data_direction data_direction); -void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int bufsize, +int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); +void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction); void smc_ib_dealloc_protection_domain(struct smc_link *lnk); int smc_ib_create_protection_domain(struct smc_link *lnk); void smc_ib_destroy_queue_pair(struct smc_link *lnk); @@ -65,6 +65,13 @@ int smc_ib_ready_link(struct smc_link *lnk); int smc_ib_modify_qp_rts(struct smc_link *lnk); int smc_ib_modify_qp_reset(struct smc_link *lnk); long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); - - +int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, + struct smc_buf_desc *buf_slot); +void smc_ib_put_memory_region(struct ib_mr *mr); +void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction); +void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction); #endif diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index f0c8b089f770..b17a333e9bb0 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -170,6 +170,7 @@ copy: copylen, conn->rmbe_size - cons.count); chunk_len_sum = chunk_len; chunk_off = cons.count; + smc_rmb_sync_sg_for_cpu(conn); for (chunk = 0; chunk < 2; chunk++) { if (!(flags & MSG_TRUNC)) { rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off, @@ -177,6 +178,7 @@ copy: if (rc) { if (!read_done) read_done = -EFAULT; + smc_rmb_sync_sg_for_device(conn); goto out; } } @@ -190,6 +192,7 @@ copy: chunk_len_sum += chunk_len; chunk_off = 0; /* modulo offset in recv ring buffer */ } + smc_rmb_sync_sg_for_device(conn); /* update cursors */ if (!(flags & MSG_PEEK)) { diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 21ec1832ab51..3c656beb8820 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -174,10 +174,12 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) copylen, conn->sndbuf_size - tx_cnt_prep); chunk_len_sum = chunk_len; chunk_off = tx_cnt_prep; + smc_sndbuf_sync_sg_for_cpu(conn); for (chunk = 0; chunk < 2; chunk++) { rc = memcpy_from_msg(sndbuf_base + chunk_off, msg, chunk_len); if (rc) { + smc_sndbuf_sync_sg_for_device(conn); if (send_done) return send_done; goto out_err; @@ -192,6 +194,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) chunk_len_sum += chunk_len; chunk_off = 0; /* modulo offset in send ring buffer */ } + smc_sndbuf_sync_sg_for_device(conn); /* update cursors */ smc_curs_add(conn->sndbuf_size, &prep, copylen); smc_curs_write(&conn->tx_curs_prep, @@ -277,6 +280,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) struct smc_link_group *lgr = conn->lgr; int to_send, rmbespace; struct smc_link *link; + dma_addr_t dma_addr; int num_sges; int rc; @@ -334,12 +338,11 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) src_len = conn->sndbuf_size - sent.count; } src_len_sum = src_len; + dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); for (dstchunk = 0; dstchunk < 2; dstchunk++) { num_sges = 0; for (srcchunk = 0; srcchunk < 2; srcchunk++) { - sges[srcchunk].addr = - conn->sndbuf_desc->dma_addr[SMC_SINGLE_LINK] + - src_off; + sges[srcchunk].addr = dma_addr + src_off; sges[srcchunk].length = src_len; sges[srcchunk].lkey = link->roce_pd->local_dma_lkey; num_sges++; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 874ee9f9d796..ab56bda66783 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -68,6 +68,16 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) int i; link = wc->qp->qp_context; + + if (wc->opcode == IB_WC_REG_MR) { + if (wc->status) + link->wr_reg_state = FAILED; + else + link->wr_reg_state = CONFIRMED; + wake_up(&link->wr_reg_wait); + return; + } + pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id); if (pnd_snd_idx == link->wr_tx_cnt) return; @@ -243,6 +253,52 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) return rc; } +/* Register a memory region and wait for result. */ +int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) +{ + struct ib_send_wr *failed_wr = NULL; + int rc; + + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + link->wr_reg_state = POSTED; + link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; + link->wr_reg.mr = mr; + link->wr_reg.key = mr->rkey; + failed_wr = &link->wr_reg.wr; + rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, &failed_wr); + WARN_ON(failed_wr != &link->wr_reg.wr); + if (rc) + return rc; + + rc = wait_event_interruptible_timeout(link->wr_reg_wait, + (link->wr_reg_state != POSTED), + SMC_WR_REG_MR_WAIT_TIME); + if (!rc) { + /* timeout - terminate connections */ + struct smc_link_group *lgr; + + lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + smc_lgr_terminate(lgr); + return -EPIPE; + } + if (rc == -ERESTARTSYS) + return -EINTR; + switch (link->wr_reg_state) { + case CONFIRMED: + rc = 0; + break; + case FAILED: + rc = -EIO; + break; + case POSTED: + rc = -EPIPE; + break; + } + return rc; +} + void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type, smc_wr_tx_filter filter, smc_wr_tx_dismisser dismisser, @@ -458,6 +514,11 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i]; lnk->wr_rx_ibs[i].num_sge = 1; } + lnk->wr_reg.wr.next = NULL; + lnk->wr_reg.wr.num_sge = 0; + lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED; + lnk->wr_reg.wr.opcode = IB_WR_REG_MR; + lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE; } void smc_wr_free_link(struct smc_link *lnk) @@ -602,6 +663,8 @@ int smc_wr_create_link(struct smc_link *lnk) smc_wr_init_sge(lnk); memset(lnk->wr_tx_mask, 0, BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); + init_waitqueue_head(&lnk->wr_tx_wait); + init_waitqueue_head(&lnk->wr_reg_wait); return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 0b9beeda6053..45eb53833052 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -102,5 +102,6 @@ void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); int smc_wr_rx_post_init(struct smc_link *link); void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context); +int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr); #endif /* SMC_WR_H */ diff --git a/net/socket.c b/net/socket.c index ad22df1ffbd1..cb0fdf799f40 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3405,7 +3405,6 @@ u32 kernel_sock_ip_overhead(struct sock *sk) struct inet_sock *inet; struct ip_options_rcu *opt; u32 overhead = 0; - bool owned_by_user; #if IS_ENABLED(CONFIG_IPV6) struct ipv6_pinfo *np; struct ipv6_txoptions *optv6 = NULL; @@ -3414,13 +3413,12 @@ u32 kernel_sock_ip_overhead(struct sock *sk) if (!sk) return overhead; - owned_by_user = sock_owned_by_user(sk); switch (sk->sk_family) { case AF_INET: inet = inet_sk(sk); overhead += sizeof(struct iphdr); opt = rcu_dereference_protected(inet->inet_opt, - owned_by_user); + sock_owned_by_user(sk)); if (opt) overhead += opt->opt.optlen; return overhead; @@ -3430,7 +3428,7 @@ u32 kernel_sock_ip_overhead(struct sock *sk) overhead += sizeof(struct ipv6hdr); if (np) optv6 = rcu_dereference_protected(np->opt, - owned_by_user); + sock_owned_by_user(sk)); if (optv6) overhead += (optv6->opt_flen + optv6->opt_nflen); return overhead; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7b52a380d710..5c53f22d62e8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1528,26 +1528,13 @@ static inline bool too_many_unix_fds(struct task_struct *p) return false; } -#define MAX_RECURSION_LEVEL 4 - static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { int i; - unsigned char max_level = 0; if (too_many_unix_fds(current)) return -ETOOMANYREFS; - for (i = scm->fp->count - 1; i >= 0; i--) { - struct sock *sk = unix_get_socket(scm->fp->fp[i]); - - if (sk) - max_level = max(max_level, - unix_sk(sk)->recursion_level); - } - if (unlikely(max_level > MAX_RECURSION_LEVEL)) - return -ETOOMANYREFS; - /* * Need to duplicate file references for the sake of garbage * collection. Otherwise a socket in the fps might become a @@ -1559,7 +1546,7 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) for (i = scm->fp->count - 1; i >= 0; i--) unix_inflight(scm->fp->user, scm->fp->fp[i]); - return max_level; + return 0; } static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) @@ -1649,7 +1636,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, struct sk_buff *skb; long timeo; struct scm_cookie scm; - int max_level; int data_len = 0; int sk_locked; @@ -1701,7 +1687,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, err = unix_scm_to_skb(&scm, skb, true); if (err < 0) goto out_free; - max_level = err + 1; skb_put(skb, len - data_len); skb->data_len = data_len; @@ -1819,8 +1804,6 @@ restart_locked: __net_timestamp(skb); maybe_add_creds(skb, sock, other); skb_queue_tail(&other->sk_receive_queue, skb); - if (max_level > unix_sk(other)->recursion_level) - unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); other->sk_data_ready(other); sock_put(other); @@ -1855,7 +1838,6 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int sent = 0; struct scm_cookie scm; bool fds_sent = false; - int max_level; int data_len; wait_for_unix_gc(); @@ -1905,7 +1887,6 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, kfree_skb(skb); goto out_err; } - max_level = err + 1; fds_sent = true; skb_put(skb, size - data_len); @@ -1925,8 +1906,6 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, maybe_add_creds(skb, sock, other); skb_queue_tail(&other->sk_receive_queue, skb); - if (max_level > unix_sk(other)->recursion_level) - unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); other->sk_data_ready(other); sent += size; @@ -2324,7 +2303,6 @@ redo: last_len = last ? last->len : 0; again: if (skb == NULL) { - unix_sk(sk)->recursion_level = 0; if (copied >= target) goto unlock; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 5f7e8bfa0c2d..5cd7a244e88d 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -153,6 +153,7 @@ static int xfrm_dev_register(struct net_device *dev) static int xfrm_dev_unregister(struct net_device *dev) { + xfrm_policy_cache_flush(); return NOTIFY_DONE; } @@ -175,8 +176,7 @@ static int xfrm_dev_down(struct net_device *dev) if (dev->features & NETIF_F_HW_ESP) xfrm_dev_state_flush(dev_net(dev), dev, true); - xfrm_garbage_collect(dev_net(dev)); - + xfrm_policy_cache_flush(); return NOTIFY_DONE; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ff61d8557929..06c3bf7ab86b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -24,6 +24,7 @@ #include <linux/netfilter.h> #include <linux/module.h> #include <linux/cache.h> +#include <linux/cpu.h> #include <linux/audit.h> #include <net/dst.h> #include <net/flow.h> @@ -44,6 +45,8 @@ struct xfrm_flo { u8 flags; }; +static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst); +static struct work_struct *xfrm_pcpu_work __read_mostly; static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; @@ -246,36 +249,6 @@ expired: xfrm_pol_put(xp); } -static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo) -{ - struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo); - - if (unlikely(pol->walk.dead)) - flo = NULL; - else - xfrm_pol_hold(pol); - - return flo; -} - -static int xfrm_policy_flo_check(struct flow_cache_object *flo) -{ - struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo); - - return !pol->walk.dead; -} - -static void xfrm_policy_flo_delete(struct flow_cache_object *flo) -{ - xfrm_pol_put(container_of(flo, struct xfrm_policy, flo)); -} - -static const struct flow_cache_ops xfrm_policy_fc_ops = { - .get = xfrm_policy_flo_get, - .check = xfrm_policy_flo_check, - .delete = xfrm_policy_flo_delete, -}; - /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2 * SPD calls. */ @@ -298,7 +271,6 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) (unsigned long)policy); setup_timer(&policy->polq.hold_timer, xfrm_policy_queue_process, (unsigned long)policy); - policy->flo.ops = &xfrm_policy_fc_ops; } return policy; } @@ -798,7 +770,6 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) else hlist_add_head(&policy->bydst, chain); __xfrm_policy_link(policy, dir); - atomic_inc(&net->xfrm.flow_cache_genid); /* After previous checking, family can either be AF_INET or AF_INET6 */ if (policy->family == AF_INET) @@ -1004,6 +975,8 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) } if (!cnt) err = -ESRCH; + else + xfrm_policy_cache_flush(); out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; @@ -1175,7 +1148,7 @@ fail: } static struct xfrm_policy * -__xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir) +xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir) { #ifdef CONFIG_XFRM_SUB_POLICY struct xfrm_policy *pol; @@ -1187,61 +1160,6 @@ __xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir); } -static int flow_to_policy_dir(int dir) -{ - if (XFRM_POLICY_IN == FLOW_DIR_IN && - XFRM_POLICY_OUT == FLOW_DIR_OUT && - XFRM_POLICY_FWD == FLOW_DIR_FWD) - return dir; - - switch (dir) { - default: - case FLOW_DIR_IN: - return XFRM_POLICY_IN; - case FLOW_DIR_OUT: - return XFRM_POLICY_OUT; - case FLOW_DIR_FWD: - return XFRM_POLICY_FWD; - } -} - -static struct flow_cache_object * -xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, - u8 dir, struct flow_cache_object *old_obj, void *ctx) -{ - struct xfrm_policy *pol; - - if (old_obj) - xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo)); - - pol = __xfrm_policy_lookup(net, fl, family, flow_to_policy_dir(dir)); - if (IS_ERR_OR_NULL(pol)) - return ERR_CAST(pol); - - /* Resolver returns two references: - * one for cache and one for caller of flow_cache_lookup() */ - xfrm_pol_hold(pol); - - return &pol->flo; -} - -static inline int policy_to_flow_dir(int dir) -{ - if (XFRM_POLICY_IN == FLOW_DIR_IN && - XFRM_POLICY_OUT == FLOW_DIR_OUT && - XFRM_POLICY_FWD == FLOW_DIR_FWD) - return dir; - switch (dir) { - default: - case XFRM_POLICY_IN: - return FLOW_DIR_IN; - case XFRM_POLICY_OUT: - return FLOW_DIR_OUT; - case XFRM_POLICY_FWD: - return FLOW_DIR_FWD; - } -} - static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, const struct flowi *fl, u16 family) { @@ -1261,7 +1179,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, } err = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, - policy_to_flow_dir(dir)); + dir); if (!err) { if (!xfrm_pol_hold_rcu(pol)) goto again; @@ -1545,58 +1463,6 @@ static int xfrm_get_tos(const struct flowi *fl, int family) return tos; } -static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *flo) -{ - struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo); - struct dst_entry *dst = &xdst->u.dst; - - if (xdst->route == NULL) { - /* Dummy bundle - if it has xfrms we were not - * able to build bundle as template resolution failed. - * It means we need to try again resolving. */ - if (xdst->num_xfrms > 0) - return NULL; - } else if (dst->flags & DST_XFRM_QUEUE) { - return NULL; - } else { - /* Real bundle */ - if (stale_bundle(dst)) - return NULL; - } - - dst_hold(dst); - return flo; -} - -static int xfrm_bundle_flo_check(struct flow_cache_object *flo) -{ - struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo); - struct dst_entry *dst = &xdst->u.dst; - - if (!xdst->route) - return 0; - if (stale_bundle(dst)) - return 0; - - return 1; -} - -static void xfrm_bundle_flo_delete(struct flow_cache_object *flo) -{ - struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo); - struct dst_entry *dst = &xdst->u.dst; - - /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */ - dst->obsolete = DST_OBSOLETE_DEAD; - dst_release_immediate(dst); -} - -static const struct flow_cache_ops xfrm_bundle_fc_ops = { - .get = xfrm_bundle_flo_get, - .check = xfrm_bundle_flo_check, - .delete = xfrm_bundle_flo_delete, -}; - static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); @@ -1624,7 +1490,6 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) struct dst_entry *dst = &xdst->u.dst; memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst)); - xdst->flo.ops = &xfrm_bundle_fc_ops; } else xdst = ERR_PTR(-ENOBUFS); @@ -1840,6 +1705,102 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, } +static void xfrm_last_dst_update(struct xfrm_dst *xdst, struct xfrm_dst *old) +{ + this_cpu_write(xfrm_last_dst, xdst); + if (old) + dst_release(&old->u.dst); +} + +static void __xfrm_pcpu_work_fn(void) +{ + struct xfrm_dst *old; + + old = this_cpu_read(xfrm_last_dst); + if (old && !xfrm_bundle_ok(old)) + xfrm_last_dst_update(NULL, old); +} + +static void xfrm_pcpu_work_fn(struct work_struct *work) +{ + local_bh_disable(); + rcu_read_lock(); + __xfrm_pcpu_work_fn(); + rcu_read_unlock(); + local_bh_enable(); +} + +void xfrm_policy_cache_flush(void) +{ + struct xfrm_dst *old; + bool found = 0; + int cpu; + + local_bh_disable(); + rcu_read_lock(); + for_each_possible_cpu(cpu) { + old = per_cpu(xfrm_last_dst, cpu); + if (old && !xfrm_bundle_ok(old)) { + if (smp_processor_id() == cpu) { + __xfrm_pcpu_work_fn(); + continue; + } + found = true; + break; + } + } + + rcu_read_unlock(); + local_bh_enable(); + + if (!found) + return; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + bool bundle_release; + + rcu_read_lock(); + old = per_cpu(xfrm_last_dst, cpu); + bundle_release = old && !xfrm_bundle_ok(old); + rcu_read_unlock(); + + if (!bundle_release) + continue; + + if (cpu_online(cpu)) { + schedule_work_on(cpu, &xfrm_pcpu_work[cpu]); + continue; + } + + rcu_read_lock(); + old = per_cpu(xfrm_last_dst, cpu); + if (old && !xfrm_bundle_ok(old)) { + per_cpu(xfrm_last_dst, cpu) = NULL; + dst_release(&old->u.dst); + } + rcu_read_unlock(); + } + + put_online_cpus(); +} + +static bool xfrm_pol_dead(struct xfrm_dst *xdst) +{ + unsigned int num_pols = xdst->num_pols; + unsigned int pol_dead = 0, i; + + for (i = 0; i < num_pols; i++) + pol_dead |= xdst->pols[i]->walk.dead; + + /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */ + if (pol_dead) + xdst->u.dst.obsolete = DST_OBSOLETE_DEAD; + + return pol_dead; +} + static struct xfrm_dst * xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, const struct flowi *fl, u16 family, @@ -1847,10 +1808,22 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, { struct net *net = xp_net(pols[0]); struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; + struct xfrm_dst *xdst, *old; struct dst_entry *dst; - struct xfrm_dst *xdst; int err; + xdst = this_cpu_read(xfrm_last_dst); + if (xdst && + xdst->u.dst.dev == dst_orig->dev && + xdst->num_pols == num_pols && + !xfrm_pol_dead(xdst) && + memcmp(xdst->pols, pols, + sizeof(struct xfrm_policy *) * num_pols) == 0) { + dst_hold(&xdst->u.dst); + return xdst; + } + + old = xdst; /* Try to instantiate a bundle */ err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family); if (err <= 0) { @@ -1871,6 +1844,9 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); xdst->policy_genid = atomic_read(&pols[0]->genid); + atomic_set(&xdst->u.dst.__refcnt, 2); + xfrm_last_dst_update(xdst, old); + return xdst; } @@ -2051,86 +2027,39 @@ free_dst: goto out; } -static struct flow_cache_object * -xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, - struct flow_cache_object *oldflo, void *ctx) +static struct xfrm_dst * +xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct xfrm_flo *xflo) { - struct xfrm_flo *xflo = (struct xfrm_flo *)ctx; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; - struct xfrm_dst *xdst, *new_xdst; - int num_pols = 0, num_xfrms = 0, i, err, pol_dead; - - /* Check if the policies from old bundle are usable */ - xdst = NULL; - if (oldflo) { - xdst = container_of(oldflo, struct xfrm_dst, flo); - num_pols = xdst->num_pols; - num_xfrms = xdst->num_xfrms; - pol_dead = 0; - for (i = 0; i < num_pols; i++) { - pols[i] = xdst->pols[i]; - pol_dead |= pols[i]->walk.dead; - } - if (pol_dead) { - /* Mark DST_OBSOLETE_DEAD to fail the next - * xfrm_dst_check() - */ - xdst->u.dst.obsolete = DST_OBSOLETE_DEAD; - dst_release_immediate(&xdst->u.dst); - xdst = NULL; - num_pols = 0; - num_xfrms = 0; - oldflo = NULL; - } - } + int num_pols = 0, num_xfrms = 0, err; + struct xfrm_dst *xdst; /* Resolve policies to use if we couldn't get them from * previous cache entry */ - if (xdst == NULL) { - num_pols = 1; - pols[0] = __xfrm_policy_lookup(net, fl, family, - flow_to_policy_dir(dir)); - err = xfrm_expand_policies(fl, family, pols, + num_pols = 1; + pols[0] = xfrm_policy_lookup(net, fl, family, dir); + err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); - if (err < 0) - goto inc_error; - if (num_pols == 0) - return NULL; - if (num_xfrms <= 0) - goto make_dummy_bundle; - } + if (err < 0) + goto inc_error; + if (num_pols == 0) + return NULL; + if (num_xfrms <= 0) + goto make_dummy_bundle; - new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, + xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, xflo->dst_orig); - if (IS_ERR(new_xdst)) { - err = PTR_ERR(new_xdst); + if (IS_ERR(xdst)) { + err = PTR_ERR(xdst); if (err != -EAGAIN) goto error; - if (oldflo == NULL) - goto make_dummy_bundle; - dst_hold(&xdst->u.dst); - return oldflo; - } else if (new_xdst == NULL) { + goto make_dummy_bundle; + } else if (xdst == NULL) { num_xfrms = 0; - if (oldflo == NULL) - goto make_dummy_bundle; - xdst->num_xfrms = 0; - dst_hold(&xdst->u.dst); - return oldflo; - } - - /* Kill the previous bundle */ - if (xdst) { - /* The policies were stolen for newly generated bundle */ - xdst->num_pols = 0; - /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */ - xdst->u.dst.obsolete = DST_OBSOLETE_DEAD; - dst_release_immediate(&xdst->u.dst); + goto make_dummy_bundle; } - /* We do need to return one reference for original caller */ - dst_hold(&new_xdst->u.dst); - return &new_xdst->flo; + return xdst; make_dummy_bundle: /* We found policies, but there's no bundles to instantiate: @@ -2146,17 +2075,12 @@ make_dummy_bundle: memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); dst_hold(&xdst->u.dst); - return &xdst->flo; + return xdst; inc_error: XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); error: - if (xdst != NULL) { - /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */ - xdst->u.dst.obsolete = DST_OBSOLETE_DEAD; - dst_release_immediate(&xdst->u.dst); - } else - xfrm_pols_put(pols, num_pols); + xfrm_pols_put(pols, num_pols); return ERR_PTR(err); } @@ -2187,11 +2111,10 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct sock *sk, int flags) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; - struct flow_cache_object *flo; struct xfrm_dst *xdst; struct dst_entry *dst, *route; u16 family = dst_orig->ops->family; - u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT); + u8 dir = XFRM_POLICY_OUT; int i, err, num_pols, num_xfrms = 0, drop_pols = 0; dst = NULL; @@ -2242,15 +2165,13 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, !net->xfrm.policy_count[XFRM_POLICY_OUT]) goto nopol; - flo = flow_cache_lookup(net, fl, family, dir, - xfrm_bundle_lookup, &xflo); - if (flo == NULL) + xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo); + if (xdst == NULL) goto nopol; - if (IS_ERR(flo)) { - err = PTR_ERR(flo); + if (IS_ERR(xdst)) { + err = PTR_ERR(xdst); goto dropdst; } - xdst = container_of(flo, struct xfrm_dst, flo); num_pols = xdst->num_pols; num_xfrms = xdst->num_xfrms; @@ -2449,12 +2370,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, int pi; int reverse; struct flowi fl; - u8 fl_dir; int xerr_idx = -1; reverse = dir & ~XFRM_POLICY_MASK; dir &= XFRM_POLICY_MASK; - fl_dir = policy_to_flow_dir(dir); if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); @@ -2486,16 +2405,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } } - if (!pol) { - struct flow_cache_object *flo; - - flo = flow_cache_lookup(net, &fl, family, fl_dir, - xfrm_policy_lookup, NULL); - if (IS_ERR_OR_NULL(flo)) - pol = ERR_CAST(flo); - else - pol = container_of(flo, struct xfrm_policy, flo); - } + if (!pol) + pol = xfrm_policy_lookup(net, &fl, family, dir); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); @@ -2641,11 +2552,9 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) * notice. That's what we are validating here via the * stale_bundle() check. * - * When an xdst is removed from flow cache, DST_OBSOLETE_DEAD will - * be marked on it. * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will * be marked on it. - * Both will force stable_bundle() to fail on any xdst bundle with + * This will force stale_bundle() to fail on any xdst bundle with * this dst linked in it. */ if (dst->obsolete < 0 && !stale_bundle(dst)) @@ -2685,18 +2594,6 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) return dst; } -void xfrm_garbage_collect(struct net *net) -{ - flow_cache_flush(net); -} -EXPORT_SYMBOL(xfrm_garbage_collect); - -void xfrm_garbage_collect_deferred(struct net *net) -{ - flow_cache_flush_deferred(net); -} -EXPORT_SYMBOL(xfrm_garbage_collect_deferred); - static void xfrm_init_pmtu(struct dst_entry *dst) { do { @@ -3034,14 +2931,9 @@ static int __net_init xfrm_net_init(struct net *net) rv = xfrm_sysctl_init(net); if (rv < 0) goto out_sysctl; - rv = flow_cache_init(net); - if (rv < 0) - goto out; return 0; -out: - xfrm_sysctl_fini(net); out_sysctl: xfrm_policy_fini(net); out_policy: @@ -3054,7 +2946,6 @@ out_statistics: static void __net_exit xfrm_net_exit(struct net *net) { - flow_cache_fini(net); xfrm_sysctl_fini(net); xfrm_policy_fini(net); xfrm_state_fini(net); @@ -3068,7 +2959,15 @@ static struct pernet_operations __net_initdata xfrm_net_ops = { void __init xfrm_init(void) { - flow_cache_hp_init(); + int i; + + xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work), + GFP_KERNEL); + BUG_ON(!xfrm_pcpu_work); + + for (i = 0; i < NR_CPUS; i++) + INIT_WORK(&xfrm_pcpu_work[i], xfrm_pcpu_work_fn); + register_pernet_subsys(&xfrm_net_ops); seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 6c0956d10db6..82cbbce69b79 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -724,9 +724,10 @@ restart: } } } - if (cnt) + if (cnt) { err = 0; - + xfrm_policy_cache_flush(); + } out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); return err; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 2be4c6af008a..1b539b7dcfab 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1815,8 +1815,6 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, out: xfrm_pol_put(xp); - if (delete && err == 0) - xfrm_garbage_collect(net); return err; } @@ -2027,7 +2025,6 @@ static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; return err; } - xfrm_garbage_collect(net); c.data.type = type; c.event = nlh->nlmsg_type; |
