Diffstat (limited to 'net')
122 files changed, 4078 insertions, 1626 deletions
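The first hunks below thread a struct netlink_ext_ack through vlan_check_real_dev() so that each validation failure carries a human-readable reason back to the netlink caller instead of a bare errno. A minimal sketch of the pattern, not part of the diff (the helper my_vlan_check() is hypothetical; NL_SET_ERR_MSG_MOD, NETIF_F_VLAN_CHALLENGED and the errno value are the real kernel API):

static int my_vlan_check(struct net_device *real_dev,
			 struct netlink_ext_ack *extack)
{
	if (real_dev->features & NETIF_F_VLAN_CHALLENGED) {
		/* records a static string in the ack; NL_SET_ERR_MSG_MOD
		 * prefixes it with KBUILD_MODNAME, so iproute2 shows e.g.
		 * "Error: 8021q: VLANs not supported on device."
		 */
		NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device");
		return -EOPNOTSUPP;
	}
	return 0;
}

NL_SET_ERR_MSG_MOD checks extack for NULL before writing, which is why the ioctl path in register_vlan_device() below can simply pass NULL.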
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 5505ee6ebdbe..73a65789271b 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -118,17 +118,21 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) } int vlan_check_real_dev(struct net_device *real_dev, - __be16 protocol, u16 vlan_id) + __be16 protocol, u16 vlan_id, + struct netlink_ext_ack *extack) { const char *name = real_dev->name; if (real_dev->features & NETIF_F_VLAN_CHALLENGED) { pr_info("VLANs not supported on %s\n", name); + NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device"); return -EOPNOTSUPP; } - if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) + if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) { + NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists"); return -EEXIST; + } return 0; } @@ -215,7 +219,8 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) if (vlan_id >= VLAN_VID_MASK) return -ERANGE; - err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id); + err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id, + NULL); if (err < 0) return err; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index e23aac3e4d37..44df1c3df02d 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -109,7 +109,8 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask); void vlan_dev_get_realdev_name(const struct net_device *dev, char *result); int vlan_check_real_dev(struct net_device *real_dev, - __be16 protocol, u16 vlan_id); + __be16 protocol, u16 vlan_id, + struct netlink_ext_ack *extack); void vlan_setup(struct net_device *dev); int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack); void unregister_vlan_dev(struct net_device *dev, struct list_head *head); diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 6689c0b272a7..9b60c1e399e2 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -47,14 +47,20 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[], int err; if (tb[IFLA_ADDRESS]) { - if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { + NL_SET_ERR_MSG_MOD(extack, "Invalid link address"); return -EINVAL; - if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + } + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid link address"); return -EADDRNOTAVAIL; + } } - if (!data) + if (!data) { + NL_SET_ERR_MSG_MOD(extack, "VLAN properties not specified"); return -EINVAL; + } if (data[IFLA_VLAN_PROTOCOL]) { switch (nla_get_be16(data[IFLA_VLAN_PROTOCOL])) { @@ -62,29 +68,38 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[], case htons(ETH_P_8021AD): break; default: + NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN protocol"); return -EPROTONOSUPPORT; } } if (data[IFLA_VLAN_ID]) { id = nla_get_u16(data[IFLA_VLAN_ID]); - if (id >= VLAN_VID_MASK) + if (id >= VLAN_VID_MASK) { + NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN id"); return -ERANGE; + } } if (data[IFLA_VLAN_FLAGS]) { flags = nla_data(data[IFLA_VLAN_FLAGS]); if ((flags->flags & flags->mask) & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | - VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) + VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN flags"); return -EINVAL; + } } err = vlan_validate_qos_map(data[IFLA_VLAN_INGRESS_QOS]); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG_MOD(extack, "Invalid ingress QOS map"); return err; + } err = 
vlan_validate_qos_map(data[IFLA_VLAN_EGRESS_QOS]); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG_MOD(extack, "Invalid egress QOS map"); return err; + } return 0; } @@ -126,14 +141,21 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, __be16 proto; int err; - if (!data[IFLA_VLAN_ID]) + if (!data[IFLA_VLAN_ID]) { + NL_SET_ERR_MSG_MOD(extack, "VLAN id not specified"); return -EINVAL; + } - if (!tb[IFLA_LINK]) + if (!tb[IFLA_LINK]) { + NL_SET_ERR_MSG_MOD(extack, "link not specified"); return -EINVAL; + } + real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); - if (!real_dev) + if (!real_dev) { + NL_SET_ERR_MSG_MOD(extack, "link does not exist"); return -ENODEV; + } if (data[IFLA_VLAN_PROTOCOL]) proto = nla_get_be16(data[IFLA_VLAN_PROTOCOL]); @@ -146,7 +168,8 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, dev->priv_flags |= (real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlan->flags = VLAN_FLAG_REORDER_HDR; - err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id); + err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id, + extack); if (err < 0) return err; diff --git a/net/Kconfig b/net/Kconfig index df8d45ef47d8..ba554cedb615 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -202,6 +202,8 @@ source "net/bridge/netfilter/Kconfig" endif +source "net/bpfilter/Kconfig" + source "net/dccp/Kconfig" source "net/sctp/Kconfig" source "net/rds/Kconfig" diff --git a/net/Makefile b/net/Makefile index 77aaddedbd29..bdaf53925acd 100644 --- a/net/Makefile +++ b/net/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ obj-$(CONFIG_NET) += ipv6/ +obj-$(CONFIG_BPFILTER) += bpfilter/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ obj-$(CONFIG_BRIDGE) += bridge/ diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index e4e2e02b7380..de8034d80623 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -35,7 +35,7 @@ config BATMAN_ADV config BATMAN_ADV_BATMAN_V bool "B.A.T.M.A.N. V protocol (experimental)" depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y) - default n + default y help This option enables the B.A.T.M.A.N. V protocol, the successor of the currently used B.A.T.M.A.N. IV protocol. The main @@ -94,13 +94,13 @@ config BATMAN_ADV_DEBUGFS bool "batman-adv debugfs entries" depends on BATMAN_ADV depends on DEBUG_FS - default y + default n help Enable this to export routing related debug tables via debugfs. The information for each soft-interface and used hard-interface can be found under batman_adv/ - If unsure, say Y. + If unsure, say N. config BATMAN_ADV_DEBUG bool "B.A.T.M.A.N. debugging" diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 28687493599f..71c20c1d4002 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -127,7 +127,20 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) rtnl_lock(); ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); rtnl_unlock(); - if (ret == 0) { + + /* Virtual interface drivers such as tun / tap interfaces, VLAN, etc + * tend to initialize the interface throughput with some value for the + * sake of having a throughput number to export via ethtool. This + * exported throughput leaves batman-adv to conclude the interface + * throughput is genuine (reflecting reality), thus no measurements + * are necessary. 
+ * + * Based on the observation that those interface types also tend to set + * the link auto-negotiation to 'off', batman-adv shall check this + * setting to differentiate between genuine link throughput information + * and placeholders installed by virtual interfaces. + */ + if (ret == 0 && link_settings.base.autoneg == AUTONEG_ENABLE) { /* link characteristics might change over time */ if (link_settings.base.duplex == DUPLEX_FULL) hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 057a28a9fe88..8da3c9336111 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -25,7 +25,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2018.1" +#define BATADV_SOURCE_VERSION "2018.2" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index a11d3d89f012..36fd7b06c7cc 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -815,9 +815,6 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, if (!atomic_read(&bat_priv->multicast_mode)) return -EINVAL; - if (atomic_read(&bat_priv->mcast.num_disabled)) - return -EINVAL; - switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb, @@ -1183,33 +1180,23 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, { bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND); u8 mcast_flags = BATADV_NO_FLAGS; - bool orig_initialized; if (orig_mcast_enabled && tvlv_value && tvlv_value_len >= sizeof(mcast_flags)) mcast_flags = *(u8 *)tvlv_value; + if (!orig_mcast_enabled) { + mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4; + mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6; + } + spin_lock_bh(&orig->mcast_handler_lock); - orig_initialized = test_bit(BATADV_ORIG_CAPA_HAS_MCAST, - &orig->capa_initialized); - /* If mcast support is turned on decrease the disabled mcast node - * counter only if we had increased it for this node before. If this - * is a completely new orig_node no need to decrease the counter. - */ if (orig_mcast_enabled && !test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) { - if (orig_initialized) - atomic_dec(&bat_priv->mcast.num_disabled); set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); - /* If mcast support is being switched off or if this is an initial - * OGM without mcast support then increase the disabled mcast - * node counter. 
- */ } else if (!orig_mcast_enabled && - (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) || - !orig_initialized)) { - atomic_inc(&bat_priv->mcast.num_disabled); + test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) { clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); } @@ -1595,10 +1582,6 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) spin_lock_bh(&orig->mcast_handler_lock); - if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) && - test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized)) - atomic_dec(&bat_priv->mcast.num_disabled); - batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index edeffcb9f3a2..1485263a348b 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -188,8 +188,8 @@ static void batadv_interface_set_rx_mode(struct net_device *dev) { } -static int batadv_interface_tx(struct sk_buff *skb, - struct net_device *soft_iface) +static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, + struct net_device *soft_iface) { struct ethhdr *ethhdr; struct batadv_priv *bat_priv = netdev_priv(soft_iface); @@ -796,7 +796,6 @@ static int batadv_softif_init_late(struct net_device *dev) bat_priv->mcast.querier_ipv6.shadowing = false; bat_priv->mcast.flags = BATADV_NO_FLAGS; atomic_set(&bat_priv->multicast_mode, 1); - atomic_set(&bat_priv->mcast.num_disabled, 0); atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 476b052ad982..360357f83f20 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -215,10 +215,12 @@ struct batadv_hard_iface { struct batadv_hard_iface_bat_v bat_v; #endif +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs */ struct dentry *debug_dir; +#endif /** * @neigh_list: list of unique single hop neighbors via this interface @@ -1160,13 +1162,13 @@ struct batadv_priv_dat { */ struct batadv_mcast_querier_state { /** @exists: whether a querier exists in the mesh */ - bool exists; + unsigned char exists:1; /** * @shadowing: if a querier exists, whether it is potentially shadowing * multicast listeners (i.e. 
querier is behind our own bridge segment) */ - bool shadowing; + unsigned char shadowing:1; }; /** @@ -1207,13 +1209,10 @@ struct batadv_priv_mcast { u8 flags; /** @enabled: whether the multicast tvlv is currently enabled */ - bool enabled; + unsigned char enabled:1; /** @bridged: whether the soft interface has a bridge on top */ - bool bridged; - - /** @num_disabled: number of nodes that have no mcast tvlv */ - atomic_t num_disabled; + unsigned char bridged:1; /** * @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP @@ -1245,10 +1244,12 @@ struct batadv_priv_nc { /** @work: work queue callback item for cleanup */ struct delayed_work work; +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs */ struct dentry *debug_dir; +#endif /** * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq @@ -1392,7 +1393,7 @@ struct batadv_tp_vars { atomic_t dup_acks; /** @fast_recovery: true if in Fast Recovery mode */ - bool fast_recovery; + unsigned char fast_recovery:1; /** @recover: last sent seqno when entering Fast Recovery */ u32 recover; @@ -1601,8 +1602,10 @@ struct batadv_priv { /** @mesh_obj: kobject for sysfs mesh subdirectory */ struct kobject *mesh_obj; +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** @debug_dir: dentry for debugfs batman-adv subdirectory */ struct dentry *debug_dir; +#endif /** @forw_bat_list: list of aggregated OGMs that will be forwarded */ struct hlist_head forw_bat_list; @@ -2049,10 +2052,10 @@ struct batadv_skb_cb { * @decoded: Marks a skb as decoded, which is checked when searching for * coding opportunities in network-coding.c */ - bool decoded; + unsigned char decoded:1; /** @num_bcasts: Counter for broadcast packet retransmissions */ - unsigned int num_bcasts; + unsigned char num_bcasts; }; /** diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 40d260f2bea5..b0ee9edaae35 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3422,6 +3422,37 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, return 0; } +int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param) +{ + struct sk_buff *skb; + + if (hci_opcode_ogf(opcode) != 0x3f) { + /* A controller receiving a command shall respond with either + * a Command Status Event or a Command Complete Event. + * Therefore, all standard HCI commands must be sent via the + * standard API, using hci_send_cmd or hci_cmd_sync helpers. + * Some vendors do not comply with this rule for vendor-specific + * commands and do not return any event. We want to support + * unresponded commands for such cases only. 
+ */ + bt_dev_err(hdev, "unresponded command not supported"); + return -EINVAL; + } + + skb = hci_prepare_cmd(hdev, opcode, plen, param); + if (!skb) { + bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)", + opcode); + return -ENOMEM; + } + + hci_send_frame(hdev, skb); + + return 0; +} +EXPORT_SYMBOL(__hci_cmd_send); + /* Get data from the previously sent command */ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 139707cd9d35..235b5aaab23d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4942,10 +4942,14 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) struct hci_ev_le_advertising_info *ev = ptr; s8 rssi; - rssi = ev->data[ev->length]; - process_adv_report(hdev, ev->evt_type, &ev->bdaddr, - ev->bdaddr_type, NULL, 0, rssi, - ev->data, ev->length); + if (ev->length <= HCI_MAX_AD_LENGTH) { + rssi = ev->data[ev->length]; + process_adv_report(hdev, ev->evt_type, &ev->bdaddr, + ev->bdaddr_type, NULL, 0, rssi, + ev->data, ev->length); + } else { + bt_dev_err(hdev, "Dropping invalid advertising data"); + } ptr += sizeof(*ev) + ev->length + 1; } diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 66c0781773df..e44d34734834 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -122,7 +122,6 @@ void hci_req_sync_cancel(struct hci_dev *hdev, int err) struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u8 event, u32 timeout) { - DECLARE_WAITQUEUE(wait, current); struct hci_request req; struct sk_buff *skb; int err = 0; @@ -135,21 +134,14 @@ struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, hdev->req_status = HCI_REQ_PEND; - add_wait_queue(&hdev->req_wait_q, &wait); - set_current_state(TASK_INTERRUPTIBLE); - err = hci_req_run_skb(&req, hci_req_sync_complete); - if (err < 0) { - remove_wait_queue(&hdev->req_wait_q, &wait); - set_current_state(TASK_RUNNING); + if (err < 0) return ERR_PTR(err); - } - schedule_timeout(timeout); + err = wait_event_interruptible_timeout(hdev->req_wait_q, + hdev->req_status != HCI_REQ_PEND, timeout); - remove_wait_queue(&hdev->req_wait_q, &wait); - - if (signal_pending(current)) + if (err == -ERESTARTSYS) return ERR_PTR(-EINTR); switch (hdev->req_status) { @@ -197,7 +189,6 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req, unsigned long opt, u32 timeout, u8 *hci_status) { struct hci_request req; - DECLARE_WAITQUEUE(wait, current); int err = 0; BT_DBG("%s start", hdev->name); @@ -213,16 +204,10 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req, return err; } - add_wait_queue(&hdev->req_wait_q, &wait); - set_current_state(TASK_INTERRUPTIBLE); - err = hci_req_run_skb(&req, hci_req_sync_complete); if (err < 0) { hdev->req_status = 0; - remove_wait_queue(&hdev->req_wait_q, &wait); - set_current_state(TASK_RUNNING); - /* ENODATA means the HCI request command queue is empty. * This can happen when a request with conditionals doesn't * trigger any commands to be sent. 
This is normal behavior @@ -240,11 +225,10 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req, return err; } - schedule_timeout(timeout); - - remove_wait_queue(&hdev->req_wait_q, &wait); + err = wait_event_interruptible_timeout(hdev->req_wait_q, + hdev->req_status != HCI_REQ_PEND, timeout); - if (signal_pending(current)) + if (err == -ERESTARTSYS) return -EINTR; switch (hdev->req_status) { diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig new file mode 100644 index 000000000000..a948b072c28f --- /dev/null +++ b/net/bpfilter/Kconfig @@ -0,0 +1,16 @@ +menuconfig BPFILTER + bool "BPF based packet filtering framework (BPFILTER)" + default n + depends on NET && BPF && INET + help + This builds experimental bpfilter framework that is aiming to + provide netfilter compatible functionality via BPF + +if BPFILTER +config BPFILTER_UMH + tristate "bpfilter kernel module with user mode helper" + default m + help + This builds bpfilter kernel module with embedded user mode helper +endif + diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile new file mode 100644 index 000000000000..2af752c8ef5e --- /dev/null +++ b/net/bpfilter/Makefile @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the Linux BPFILTER layer. +# + +hostprogs-y := bpfilter_umh +bpfilter_umh-objs := main.o +HOSTCFLAGS += -I. -Itools/include/ +ifeq ($(CONFIG_BPFILTER_UMH), y) +# builtin bpfilter_umh should be compiled with -static +# since rootfs isn't mounted at the time of __init +# function is called and do_execv won't find elf interpreter +HOSTLDFLAGS += -static +endif + +# a bit of elf magic to convert bpfilter_umh binary into a binary blob +# inside bpfilter_umh.o elf file referenced by +# _binary_net_bpfilter_bpfilter_umh_start symbol +# which bpfilter_kern.c passes further into umh blob loader at run-time +quiet_cmd_copy_umh = GEN $@ + cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \ + $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \ + -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \ + --rename-section .data=.init.rodata $< $@ + +$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh + $(call cmd,copy_umh) + +obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o +bpfilter-objs += bpfilter_kern.o bpfilter_umh.o diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c new file mode 100644 index 000000000000..7596314b61c7 --- /dev/null +++ b/net/bpfilter/bpfilter_kern.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/init.h> +#include <linux/module.h> +#include <linux/umh.h> +#include <linux/bpfilter.h> +#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/fs.h> +#include <linux/file.h> +#include "msgfmt.h" + +#define UMH_start _binary_net_bpfilter_bpfilter_umh_start +#define UMH_end _binary_net_bpfilter_bpfilter_umh_end + +extern char UMH_start; +extern char UMH_end; + +static struct umh_info info; +/* since ip_getsockopt() can run in parallel, serialize access to umh */ +static DEFINE_MUTEX(bpfilter_lock); + +static void shutdown_umh(struct umh_info *info) +{ + struct task_struct *tsk; + + tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID); + if (tsk) + force_sig(SIGKILL, tsk); + fput(info->pipe_to_umh); + fput(info->pipe_from_umh); +} + +static void __stop_umh(void) +{ + if (bpfilter_process_sockopt) { + bpfilter_process_sockopt = NULL; + shutdown_umh(&info); + } +} + +static void stop_umh(void) +{ + mutex_lock(&bpfilter_lock); + __stop_umh(); + 
mutex_unlock(&bpfilter_lock); +} + +static int __bpfilter_process_sockopt(struct sock *sk, int optname, + char __user *optval, + unsigned int optlen, bool is_set) +{ + struct mbox_request req; + struct mbox_reply reply; + loff_t pos; + ssize_t n; + int ret; + + req.is_set = is_set; + req.pid = current->pid; + req.cmd = optname; + req.addr = (long)optval; + req.len = optlen; + mutex_lock(&bpfilter_lock); + n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos); + if (n != sizeof(req)) { + pr_err("write fail %zd\n", n); + __stop_umh(); + ret = -EFAULT; + goto out; + } + pos = 0; + n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos); + if (n != sizeof(reply)) { + pr_err("read fail %zd\n", n); + __stop_umh(); + ret = -EFAULT; + goto out; + } + ret = reply.status; +out: + mutex_unlock(&bpfilter_lock); + return ret; +} + +static int __init load_umh(void) +{ + int err; + + /* fork usermode process */ + err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info); + if (err) + return err; + pr_info("Loaded bpfilter_umh pid %d\n", info.pid); + + /* health check that usermode process started correctly */ + if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) { + stop_umh(); + return -EFAULT; + } + bpfilter_process_sockopt = &__bpfilter_process_sockopt; + return 0; +} + +static void __exit fini_umh(void) +{ + stop_umh(); +} +module_init(load_umh); +module_exit(fini_umh); +MODULE_LICENSE("GPL"); diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c new file mode 100644 index 000000000000..1317f108df8a --- /dev/null +++ b/net/bpfilter/main.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <sys/uio.h> +#include <errno.h> +#include <stdio.h> +#include <sys/socket.h> +#include <fcntl.h> +#include <unistd.h> +#include "include/uapi/linux/bpf.h" +#include <asm/unistd.h> +#include "msgfmt.h" + +int debug_fd; + +static int handle_get_cmd(struct mbox_request *cmd) +{ + switch (cmd->cmd) { + case 0: + return 0; + default: + break; + } + return -ENOPROTOOPT; +} + +static int handle_set_cmd(struct mbox_request *cmd) +{ + return -ENOPROTOOPT; +} + +static void loop(void) +{ + while (1) { + struct mbox_request req; + struct mbox_reply reply; + int n; + + n = read(0, &req, sizeof(req)); + if (n != sizeof(req)) { + dprintf(debug_fd, "invalid request %d\n", n); + return; + } + + reply.status = req.is_set ? 
+ handle_set_cmd(&req) : + handle_get_cmd(&req); + + n = write(1, &reply, sizeof(reply)); + if (n != sizeof(reply)) { + dprintf(debug_fd, "reply failed %d\n", n); + return; + } + } +} + +int main(void) +{ + debug_fd = open("/dev/console", 00000002); + dprintf(debug_fd, "Started bpfilter\n"); + loop(); + close(debug_fd); + return 0; +} diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h new file mode 100644 index 000000000000..98d121c62945 --- /dev/null +++ b/net/bpfilter/msgfmt.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_BPFILTER_MSGFMT_H +#define _NET_BPFILTER_MSGFMT_H + +struct mbox_request { + __u64 addr; + __u32 len; + __u32 is_set; + __u32 cmd; + __u32 pid; +}; + +struct mbox_reply { + __u32 status; +}; + +#endif diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c index 47ba98db145d..46c1fe7637ea 100644 --- a/net/bridge/netfilter/ebt_stp.c +++ b/net/bridge/netfilter/ebt_stp.c @@ -161,8 +161,8 @@ static int ebt_stp_mt_check(const struct xt_mtchk_param *par) /* Make sure the match only receives stp frames */ if (!par->nft_compat && (!ether_addr_equal(e->destmac, eth_stp_addr) || - !is_broadcast_ether_addr(e->destmsk) || - !(e->bitmask & EBT_DESTMAC))) + !(e->bitmask & EBT_DESTMAC) || + !is_broadcast_ether_addr(e->destmsk))) return -EINVAL; return 0; diff --git a/net/core/dev.c b/net/core/dev.c index 7ca19f47a92a..1844d9bc5714 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2125,7 +2125,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev, int i, j; for (i = count, j = offset; i--; j++) { - if (!remove_xps_queue(dev_maps, cpu, j)) + if (!remove_xps_queue(dev_maps, tci, j)) break; } diff --git a/net/core/devlink.c b/net/core/devlink.c index ad1317376798..475246b355f0 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -453,6 +453,27 @@ static void devlink_notify(struct devlink *devlink, enum devlink_command cmd) msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } +static int devlink_nl_port_attrs_put(struct sk_buff *msg, + struct devlink_port *devlink_port) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + + if (!attrs->set) + return 0; + if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) + return -EMSGSIZE; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number)) + return -EMSGSIZE; + if (!attrs->split) + return 0; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number)) + return -EMSGSIZE; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, + attrs->split_subport_number)) + return -EMSGSIZE; + return 0; +} + static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_port *devlink_port, enum devlink_command cmd, u32 portid, @@ -492,9 +513,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, ibdev->name)) goto nla_put_failure; } - if (devlink_port->split && - nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, - devlink_port->split_group)) + if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -2737,7 +2756,8 @@ static const struct genl_ops devlink_nl_ops[] = { .doit = devlink_nl_cmd_eswitch_set_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, @@ -2971,19 +2991,64 @@ void devlink_port_type_clear(struct devlink_port *devlink_port) 
EXPORT_SYMBOL_GPL(devlink_port_type_clear); /** - * devlink_port_split_set - Set port is split + * devlink_port_attrs_set - Set port attributes * * @devlink_port: devlink port - * @split_group: split group - identifies group split port is part of + * @flavour: flavour of the port + * @port_number: number of the port that is facing user, for example + * the front panel port number + * @split: indicates if this is split port + * @split_subport_number: if the port is split, this is the number + * of subport. */ -void devlink_port_split_set(struct devlink_port *devlink_port, - u32 split_group) -{ - devlink_port->split = true; - devlink_port->split_group = split_group; +void devlink_port_attrs_set(struct devlink_port *devlink_port, + enum devlink_port_flavour flavour, + u32 port_number, bool split, + u32 split_subport_number) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + + attrs->set = true; + attrs->flavour = flavour; + attrs->port_number = port_number; + attrs->split = split; + attrs->split_subport_number = split_subport_number; devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } -EXPORT_SYMBOL_GPL(devlink_port_split_set); +EXPORT_SYMBOL_GPL(devlink_port_attrs_set); + +int devlink_port_get_phys_port_name(struct devlink_port *devlink_port, + char *name, size_t len) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + int n = 0; + + if (!attrs->set) + return -EOPNOTSUPP; + + switch (attrs->flavour) { + case DEVLINK_PORT_FLAVOUR_PHYSICAL: + if (!attrs->split) + n = snprintf(name, len, "p%u", attrs->port_number); + else + n = snprintf(name, len, "p%us%u", attrs->port_number, + attrs->split_subport_number); + break; + case DEVLINK_PORT_FLAVOUR_CPU: + case DEVLINK_PORT_FLAVOUR_DSA: + /* As CPU and DSA ports do not have a netdevice associated + * case should not ever happen. + */ + WARN_ON(1); + return -EINVAL; + } + + if (n >= len) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_port_get_phys_port_name); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, diff --git a/net/core/filter.c b/net/core/filter.c index 1d75f9322275..acf1f4fb99d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -653,11 +653,18 @@ do_pass: #define BPF_EMIT_JMP \ do { \ + const s32 off_min = S16_MIN, off_max = S16_MAX; \ + s32 off; \ + \ if (target >= len || target < 0) \ goto err; \ - insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ + off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ /* Adjust pc relative offset for 2nd or 3rd insn. */ \ - insn->off -= insn - tmp_insns; \ + off -= insn - tmp_insns; \ + /* Reject anything not fitting into insn->off. 
*/ \ + if (off < off_min || off > off_max) \ + goto err; \ + insn->off = off; \ } while (0) case BPF_JMP | BPF_JA: diff --git a/net/core/sock.c b/net/core/sock.c index 042cfc612660..435a0ba85e52 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1613,7 +1613,7 @@ static void __sk_free(struct sock *sk) if (likely(sk->sk_net_refcnt)) sock_inuse_add(sock_net(sk), -1); - if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt)) + if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) sock_diag_broadcast_destroy(sk); else sk_destruct(sk); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index adf50fbc4c13..dc5d9af3dc80 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -258,11 +258,13 @@ static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst) static int dsa_port_setup(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; - int err; + int err = 0; memset(&dp->devlink_port, 0, sizeof(dp->devlink_port)); - err = devlink_port_register(ds->devlink, &dp->devlink_port, dp->index); + if (dp->type != DSA_PORT_TYPE_UNUSED) + err = devlink_port_register(ds->devlink, &dp->devlink_port, + dp->index); if (err) return err; @@ -270,7 +272,28 @@ static int dsa_port_setup(struct dsa_port *dp) case DSA_PORT_TYPE_UNUSED: break; case DSA_PORT_TYPE_CPU: + /* dp->index is used now as port_number. However + * CPU ports should have separate numbering + * independent from front panel port numbers. + */ + devlink_port_attrs_set(&dp->devlink_port, + DEVLINK_PORT_FLAVOUR_CPU, + dp->index, false, 0); + err = dsa_port_link_register_of(dp); + if (err) { + dev_err(ds->dev, "failed to setup link for port %d.%d\n", + ds->index, dp->index); + return err; + } + break; case DSA_PORT_TYPE_DSA: + /* dp->index is used now as port_number. However + * DSA ports should have separate numbering + * independent from front panel port numbers. 
+ */ + devlink_port_attrs_set(&dp->devlink_port, + DEVLINK_PORT_FLAVOUR_DSA, + dp->index, false, 0); err = dsa_port_link_register_of(dp); if (err) { dev_err(ds->dev, "failed to setup link for port %d.%d\n", @@ -279,6 +302,9 @@ static int dsa_port_setup(struct dsa_port *dp) } break; case DSA_PORT_TYPE_USER: + devlink_port_attrs_set(&dp->devlink_port, + DEVLINK_PORT_FLAVOUR_PHYSICAL, + dp->index, false, 0); err = dsa_slave_create(dp); if (err) dev_err(ds->dev, "failed to create slave for port %d.%d\n", @@ -293,7 +319,8 @@ static int dsa_port_setup(struct dsa_port *dp) static void dsa_port_teardown(struct dsa_port *dp) { - devlink_port_unregister(&dp->devlink_port); + if (dp->type != DSA_PORT_TYPE_UNUSED) + devlink_port_unregister(&dp->devlink_port); switch (dp->type) { case DSA_PORT_TYPE_UNUSED: diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index b379520f9133..eec9569ffa5c 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -14,7 +14,9 @@ obj-y := route.o inetpeer.o protocol.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ - metrics.o + metrics.o netlink.o + +obj-$(CONFIG_BPFILTER) += bpfilter/ obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile new file mode 100644 index 000000000000..ce262d76cc48 --- /dev/null +++ b/net/ipv4/bpfilter/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_BPFILTER) += sockopt.o + diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c new file mode 100644 index 000000000000..42a96d2d8d05 --- /dev/null +++ b/net/ipv4/bpfilter/sockopt.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/uaccess.h> +#include <linux/bpfilter.h> +#include <uapi/linux/bpf.h> +#include <linux/wait.h> +#include <linux/kmod.h> + +int (*bpfilter_process_sockopt)(struct sock *sk, int optname, + char __user *optval, + unsigned int optlen, bool is_set); +EXPORT_SYMBOL_GPL(bpfilter_process_sockopt); + +int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval, + unsigned int optlen, bool is_set) +{ + if (!bpfilter_process_sockopt) { + int err = request_module("bpfilter"); + + if (err) + return err; + if (!bpfilter_process_sockopt) + return -ECHILD; + } + return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set); +} + +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, + unsigned int optlen) +{ + return bpfilter_mbox_request(sk, optname, optval, optlen, true); +} + +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, + int __user *optlen) +{ + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + return bpfilter_mbox_request(sk, optname, optval, len, false); +} diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index f05afaf3235c..897ae92dff0f 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -326,10 +326,11 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, int rpf, struct in_device *idev, u32 *itag) { + struct net *net = dev_net(dev); + struct flow_keys flkeys; int ret, no_addr; struct fib_result res; struct flowi4 fl4; - struct net *net = dev_net(dev); bool dev_match; fl4.flowi4_oif = 0; @@ -347,6 +348,11 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, no_addr = idev->ifa_list == NULL; fl4.flowi4_mark = 
IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; + if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) { + fl4.flowi4_proto = 0; + fl4.fl4_sport = 0; + fl4.fl4_dport = 0; + } trace_fib_validate_source(dev, &fl4); @@ -643,6 +649,9 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_UID] = { .type = NLA_U32 }, [RTA_MARK] = { .type = NLA_U32 }, + [RTA_IP_PROTO] = { .type = NLA_U8 }, + [RTA_SPORT] = { .type = NLA_U16 }, + [RTA_DPORT] = { .type = NLA_U16 }, }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2409e648454d..2d8efeecf619 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -734,10 +734,12 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, erspan_build_header(skb, ntohl(tunnel->parms.o_key), tunnel->index, truncate, true); - else + else if (tunnel->erspan_ver == 2) erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key), tunnel->dir, tunnel->hwid, truncate, true); + else + goto free_skb; tunnel->parms.o_flags &= ~TUNNEL_KEY; __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN)); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b5e21eb198d8..af5a830ff6ad 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1053,7 +1053,8 @@ alloc_new_skb: if (copy > length) copy = length; - if (!(rt->dst.dev->features&NETIF_F_SG)) { + if (!(rt->dst.dev->features&NETIF_F_SG) && + skb_tailroom(skb) >= copy) { unsigned int off; off = skb->len; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 5ad2d8ed3a3f..e0791faacb24 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -47,6 +47,8 @@ #include <linux/errqueue.h> #include <linux/uaccess.h> +#include <linux/bpfilter.h> + /* * SOL_IP control messages. 
*/ @@ -1244,6 +1246,11 @@ int ip_setsockopt(struct sock *sk, int level, return -ENOPROTOOPT; err = do_ip_setsockopt(sk, level, optname, optval, optlen); +#ifdef CONFIG_BPFILTER + if (optname >= BPFILTER_IPT_SO_SET_REPLACE && + optname < BPFILTER_IPT_SET_MAX) + err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen); +#endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_HDRINCL && @@ -1552,6 +1559,11 @@ int ip_getsockopt(struct sock *sk, int level, int err; err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0); +#ifdef CONFIG_BPFILTER + if (optname >= BPFILTER_IPT_SO_GET_INFO && + optname < BPFILTER_IPT_GET_MAX) + err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); +#endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && @@ -1584,6 +1596,11 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, err = do_ip_getsockopt(sk, level, optname, optval, optlen, MSG_CMSG_COMPAT); +#ifdef CONFIG_BPFILTER + if (optname >= BPFILTER_IPT_SO_GET_INFO && + optname < BPFILTER_IPT_GET_MAX) + err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); +#endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 444f125f3974..38ab97b0a2ec 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -34,6 +34,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv4 packet filter"); +MODULE_ALIAS("ipt_icmp"); void *ipt_alloc_initial_table(const struct xt_table *info) { @@ -1782,6 +1783,8 @@ int ipt_register_table(struct net *net, const struct xt_table *table, /* set res now, will see skbs right after nf_register_net_hooks */ WRITE_ONCE(*res, new_table); + if (!ops) + return 0; ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); if (ret != 0) { @@ -1799,7 +1802,8 @@ out_free: void ipt_unregister_table(struct net *net, struct xt_table *table, const struct nf_hook_ops *ops) { - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + if (ops) + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); __ipt_unregister_table(net, table); } diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index fd01f13c896a..12843c9ef142 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -89,10 +89,10 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) return true ^ invert; } + memset(&flow, 0, sizeof(flow)); flow.flowi4_iif = LOOPBACK_IFINDEX; flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); - flow.flowi4_oif = 0; flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? 
skb->mark : 0; flow.flowi4_tos = RT_TOS(iph->tos); flow.flowi4_scope = RT_SCOPE_UNIVERSE; diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 529d89ec31e8..a317445448bf 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -38,69 +38,58 @@ static unsigned int iptable_nat_do_chain(void *priv, return ipt_do_table(skb, state, state->net->ipv4.nat_table); } -static unsigned int iptable_nat_ipv4_fn(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_nat_ipv4_fn(priv, skb, state, iptable_nat_do_chain); -} - -static unsigned int iptable_nat_ipv4_in(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_nat_ipv4_in(priv, skb, state, iptable_nat_do_chain); -} - -static unsigned int iptable_nat_ipv4_out(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_nat_ipv4_out(priv, skb, state, iptable_nat_do_chain); -} - -static unsigned int iptable_nat_ipv4_local_fn(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain); -} - static const struct nf_hook_ops nf_nat_ipv4_ops[] = { - /* Before packet filtering, change destination */ { - .hook = iptable_nat_ipv4_in, + .hook = iptable_nat_do_chain, .pf = NFPROTO_IPV4, - .nat_hook = true, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, - /* After packet filtering, change source */ { - .hook = iptable_nat_ipv4_out, + .hook = iptable_nat_do_chain, .pf = NFPROTO_IPV4, - .nat_hook = true, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, - /* Before packet filtering, change destination */ { - .hook = iptable_nat_ipv4_local_fn, + .hook = iptable_nat_do_chain, .pf = NFPROTO_IPV4, - .nat_hook = true, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, - /* After packet filtering, change source */ { - .hook = iptable_nat_ipv4_fn, + .hook = iptable_nat_do_chain, .pf = NFPROTO_IPV4, - .nat_hook = true, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, }, }; +static int ipt_nat_register_lookups(struct net *net) +{ + int i, ret; + + for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++) { + ret = nf_nat_l3proto_ipv4_register_fn(net, &nf_nat_ipv4_ops[i]); + if (ret) { + while (i) + nf_nat_l3proto_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[--i]); + + return ret; + } + } + + return 0; +} + +static void ipt_nat_unregister_lookups(struct net *net) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++) + nf_nat_l3proto_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[i]); +} + static int __net_init iptable_nat_table_init(struct net *net) { struct ipt_replace *repl; @@ -113,7 +102,18 @@ static int __net_init iptable_nat_table_init(struct net *net) if (repl == NULL) return -ENOMEM; ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, - nf_nat_ipv4_ops, &net->ipv4.nat_table); + NULL, &net->ipv4.nat_table); + if (ret < 0) { + kfree(repl); + return ret; + } + + ret = ipt_nat_register_lookups(net); + if (ret < 0) { + ipt_unregister_table(net, net->ipv4.nat_table, NULL); + net->ipv4.nat_table = NULL; + } + kfree(repl); return ret; } @@ -122,7 +122,8 @@ static void __net_exit iptable_nat_net_exit(struct net *net) { if (!net->ipv4.nat_table) return; - ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops); + ipt_nat_unregister_lookups(net); + ipt_unregister_table(net, net->ipv4.nat_table, NULL); net->ipv4.nat_table = NULL; } diff --git 
a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 325e02956bf5..6115bf1ff6f0 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -241,33 +241,18 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, } EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); -unsigned int +static unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)) + const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; - struct nf_conn_nat *nat; - /* maniptype == SRC for postrouting. */ - enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); ct = nf_ct_get(skb, &ctinfo); - /* Can't track? It's not due to stress, or conntrack would - * have dropped it. Hence it's the user's responsibilty to - * packet filter it out, or implement conntrack/NAT for that - * protocol. 8) --RR - */ if (!ct) return NF_ACCEPT; - nat = nfct_nat(ct); - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: + if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) { if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, state->hook)) @@ -275,76 +260,30 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, else return NF_ACCEPT; } - /* Only ICMPs can be IP_CT_IS_REPLY: */ - /* fall through */ - case IP_CT_NEW: - /* Seen it before? This can happen for loopback, retrans, - * or local packets. - */ - if (!nf_nat_initialized(ct, maniptype)) { - unsigned int ret; - - ret = do_chain(priv, skb, state); - if (ret != NF_ACCEPT) - return ret; - - if (nf_nat_initialized(ct, HOOK2MANIP(state->hook))) - break; - - ret = nf_nat_alloc_null_binding(ct, state->hook); - if (ret != NF_ACCEPT) - return ret; - } else { - pr_debug("Already setup manip %s for ct %p\n", - maniptype == NF_NAT_MANIP_SRC ? 
"SRC" : "DST", - ct); - if (nf_nat_oif_changed(state->hook, ctinfo, nat, - state->out)) - goto oif_changed; - } - break; - - default: - /* ESTABLISHED */ - WARN_ON(ctinfo != IP_CT_ESTABLISHED && - ctinfo != IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) - goto oif_changed; } - return nf_nat_packet(ct, ctinfo, state->hook, skb); - -oif_changed: - nf_ct_kill_acct(ct, ctinfo, skb); - return NF_DROP; + return nf_nat_inet_fn(priv, skb, state); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn); -unsigned int +static unsigned int nf_nat_ipv4_in(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)) + const struct nf_hook_state *state) { unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; - ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); + ret = nf_nat_ipv4_fn(priv, skb, state); if (ret != NF_DROP && ret != NF_STOLEN && daddr != ip_hdr(skb)->daddr) skb_dst_drop(skb); return ret; } -EXPORT_SYMBOL_GPL(nf_nat_ipv4_in); -unsigned int +static unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)) + const struct nf_hook_state *state) { #ifdef CONFIG_XFRM const struct nf_conn *ct; @@ -353,7 +292,7 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb, #endif unsigned int ret; - ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); + ret = nf_nat_ipv4_fn(priv, skb, state); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && @@ -373,21 +312,17 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb, #endif return ret; } -EXPORT_SYMBOL_GPL(nf_nat_ipv4_out); -unsigned int +static unsigned int nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)) + const struct nf_hook_state *state) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; int err; - ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); + ret = nf_nat_ipv4_fn(priv, skb, state); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); @@ -411,7 +346,49 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, } return ret; } -EXPORT_SYMBOL_GPL(nf_nat_ipv4_local_fn); + +static const struct nf_hook_ops nf_nat_ipv4_ops[] = { + /* Before packet filtering, change destination */ + { + .hook = nf_nat_ipv4_in, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP_PRI_NAT_DST, + }, + /* After packet filtering, change source */ + { + .hook = nf_nat_ipv4_out, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC, + }, + /* Before packet filtering, change destination */ + { + .hook = nf_nat_ipv4_local_fn, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_NAT_DST, + }, + /* After packet filtering, change source */ + { + .hook = nf_nat_ipv4_fn, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_NAT_SRC, + }, +}; + +int nf_nat_l3proto_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops) +{ + return nf_nat_register_fn(net, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); +} +EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_register_fn); + +void nf_nat_l3proto_ipv4_unregister_fn(struct net *net, 
const struct nf_hook_ops *ops) +{ + nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); +} +EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_unregister_fn); static int __init nf_nat_l3proto_ipv4_init(void) { diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index 285baccfbdea..a3c4ea303e3e 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -27,8 +27,8 @@ #include <net/ip.h> static unsigned int nft_nat_do_chain(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) + struct sk_buff *skb, + const struct nf_hook_state *state) { struct nft_pktinfo pkt; @@ -38,42 +38,14 @@ static unsigned int nft_nat_do_chain(void *priv, return nft_do_chain(&pkt, priv); } -static unsigned int nft_nat_ipv4_fn(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_nat_ipv4_fn(priv, skb, state, nft_nat_do_chain); -} - -static unsigned int nft_nat_ipv4_in(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_nat_ipv4_in(priv, skb, state, nft_nat_do_chain); -} - -static unsigned int nft_nat_ipv4_out(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_nat_ipv4_out(priv, skb, state, nft_nat_do_chain); -} - -static unsigned int nft_nat_ipv4_local_fn(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain); -} - -static int nft_nat_ipv4_init(struct nft_ctx *ctx) +static int nft_nat_ipv4_reg(struct net *net, const struct nf_hook_ops *ops) { - return nf_ct_netns_get(ctx->net, ctx->family); + return nf_nat_l3proto_ipv4_register_fn(net, ops); } -static void nft_nat_ipv4_free(struct nft_ctx *ctx) +static void nft_nat_ipv4_unreg(struct net *net, const struct nf_hook_ops *ops) { - nf_ct_netns_put(ctx->net, ctx->family); + nf_nat_l3proto_ipv4_unregister_fn(net, ops); } static const struct nft_chain_type nft_chain_nat_ipv4 = { @@ -86,13 +58,13 @@ static const struct nft_chain_type nft_chain_nat_ipv4 = { (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), .hooks = { - [NF_INET_PRE_ROUTING] = nft_nat_ipv4_in, - [NF_INET_POST_ROUTING] = nft_nat_ipv4_out, - [NF_INET_LOCAL_OUT] = nft_nat_ipv4_local_fn, - [NF_INET_LOCAL_IN] = nft_nat_ipv4_fn, + [NF_INET_PRE_ROUTING] = nft_nat_do_chain, + [NF_INET_POST_ROUTING] = nft_nat_do_chain, + [NF_INET_LOCAL_OUT] = nft_nat_do_chain, + [NF_INET_LOCAL_IN] = nft_nat_do_chain, }, - .init = nft_nat_ipv4_init, - .free = nft_nat_ipv4_free, + .ops_register = nft_nat_ipv4_reg, + .ops_unregister = nft_nat_ipv4_unreg, }; static int __init nft_chain_nat_init(void) diff --git a/net/ipv4/netlink.c b/net/ipv4/netlink.c new file mode 100644 index 000000000000..f86bb4f06609 --- /dev/null +++ b/net/ipv4/netlink.c @@ -0,0 +1,23 @@ +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <linux/types.h> +#include <net/net_namespace.h> +#include <net/netlink.h> +#include <net/ip.h> + +int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, + struct netlink_ext_ack *extack) +{ + *ip_proto = nla_get_u8(attr); + + switch (*ip_proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_ICMP: + return 0; + default: + NL_SET_ERR_MSG(extack, "Unsupported ip proto"); + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(rtm_getroute_parse_ip_proto); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 261b71d0ccc5..6c1ff89a60fa 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -298,6 +298,7 @@ static const 
struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), + SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ac3b22bc51b2..45ad2585eb28 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1992,8 +1992,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.saddr = saddr; fl4.flowi4_uid = sock_net_uid(net, NULL); - if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) + if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) { flkeys = &_flkeys; + } else { + fl4.flowi4_proto = 0; + fl4.fl4_sport = 0; + fl4.fl4_dport = 0; + } err = fib_lookup(net, &fl4, res, 0); if (err != 0) { @@ -2600,11 +2605,10 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, EXPORT_SYMBOL_GPL(ip_route_output_flow); /* called with rcu_read_lock held */ -static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, - struct flowi4 *fl4, struct sk_buff *skb, u32 portid, - u32 seq) +static int rt_fill_info(struct net *net, __be32 dst, __be32 src, + struct rtable *rt, u32 table_id, struct flowi4 *fl4, + struct sk_buff *skb, u32 portid, u32 seq) { - struct rtable *rt = skb_rtable(skb); struct rtmsg *r; struct nlmsghdr *nlh; unsigned long expires = 0; @@ -2700,7 +2704,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, } } else #endif - if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex)) + if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) goto nla_put_failure; } @@ -2715,43 +2719,93 @@ nla_put_failure: return -EMSGSIZE; } +static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, + u8 ip_proto, __be16 sport, + __be16 dport) +{ + struct sk_buff *skb; + struct iphdr *iph; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return NULL; + + /* Reserve room for dummy headers, this skb can pass + * through good chunk of routing engine. 
+ */ + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb->protocol = htons(ETH_P_IP); + iph = skb_put(skb, sizeof(struct iphdr)); + iph->protocol = ip_proto; + iph->saddr = src; + iph->daddr = dst; + iph->version = 0x4; + iph->frag_off = 0; + iph->ihl = 0x5; + skb_set_transport_header(skb, skb->len); + + switch (iph->protocol) { + case IPPROTO_UDP: { + struct udphdr *udph; + + udph = skb_put_zero(skb, sizeof(struct udphdr)); + udph->source = sport; + udph->dest = dport; + udph->len = sizeof(struct udphdr); + udph->check = 0; + break; + } + case IPPROTO_TCP: { + struct tcphdr *tcph; + + tcph = skb_put_zero(skb, sizeof(struct tcphdr)); + tcph->source = sport; + tcph->dest = dport; + tcph->doff = sizeof(struct tcphdr) / 4; + tcph->rst = 1; + tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), + src, dst, 0); + break; + } + case IPPROTO_ICMP: { + struct icmphdr *icmph; + + icmph = skb_put_zero(skb, sizeof(struct icmphdr)); + icmph->type = ICMP_ECHO; + icmph->code = 0; + } + } + + return skb; +} + static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); - struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; + u32 table_id = RT_TABLE_MAIN; + __be16 sport = 0, dport = 0; struct fib_result res = {}; + u8 ip_proto = IPPROTO_UDP; struct rtable *rt = NULL; + struct sk_buff *skb; + struct rtmsg *rtm; struct flowi4 fl4; __be32 dst = 0; __be32 src = 0; + kuid_t uid; u32 iif; int err; int mark; - struct sk_buff *skb; - u32 table_id = RT_TABLE_MAIN; - kuid_t uid; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) - goto errout; + return err; rtm = nlmsg_data(nlh); - - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) { - err = -ENOBUFS; - goto errout; - } - - /* Reserve room for dummy headers, this skb can pass - through good chunk of routing engine. - */ - skb_reset_mac_header(skb); - skb_reset_network_header(skb); - src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; @@ -2761,14 +2815,22 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, else uid = (iif ? INVALID_UID : current_uid()); - /* Bugfix: need to give ip_route_input enough of an IP header to - * not gag. - */ - ip_hdr(skb)->protocol = IPPROTO_UDP; - ip_hdr(skb)->saddr = src; - ip_hdr(skb)->daddr = dst; + if (tb[RTA_IP_PROTO]) { + err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], + &ip_proto, extack); + if (err) + return err; + } + + if (tb[RTA_SPORT]) + sport = nla_get_be16(tb[RTA_SPORT]); - skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); + if (tb[RTA_DPORT]) + dport = nla_get_be16(tb[RTA_DPORT]); + + skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport); + if (!skb) + return -ENOBUFS; memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst; @@ -2777,6 +2839,11 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.flowi4_oif = tb[RTA_OIF] ? 
nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; + if (sport) + fl4.fl4_sport = sport; + if (dport) + fl4.fl4_dport = dport; + fl4.flowi4_proto = ip_proto; rcu_read_lock(); @@ -2786,10 +2853,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, dev = dev_get_by_index_rcu(net, iif); if (!dev) { err = -ENODEV; - goto errout_free; + goto errout_rcu; } - skb->protocol = htons(ETH_P_IP); + fl4.flowi4_iif = iif; /* for rt_fill_info */ skb->dev = dev; skb->mark = mark; err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos, @@ -2809,7 +2876,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, } if (err) - goto errout_free; + goto errout_rcu; if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; @@ -2817,34 +2884,40 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) table_id = res.table ? res.table->tb_id : 0; + /* reset skb for netlink reply msg */ + skb_trim(skb, 0); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb_reset_mac_header(skb); + if (rtm->rtm_flags & RTM_F_FIB_MATCH) { if (!res.fi) { err = fib_props[res.type].error; if (!err) err = -EHOSTUNREACH; - goto errout_free; + goto errout_rcu; } err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, table_id, rt->rt_type, res.prefix, res.prefixlen, fl4.flowi4_tos, res.fi, 0); } else { - err = rt_fill_info(net, dst, src, table_id, &fl4, skb, + err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); } if (err < 0) - goto errout_free; + goto errout_rcu; rcu_read_unlock(); err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); -errout: - return err; errout_free: + return err; +errout_rcu: rcu_read_unlock(); kfree_skb(skb); - goto errout; + goto errout_free; } void ip_rt_multicast_event(struct in_device *in_dev) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4b195bac8ac0..d2eed3ddcb0a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -46,6 +46,7 @@ static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; +static int comp_sack_nr_max = 255; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -1152,6 +1153,22 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &one, }, { + .procname = "tcp_comp_sack_delay_ns", + .data = &init_net.ipv4.sysctl_tcp_comp_sack_delay_ns, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "tcp_comp_sack_nr", + .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &comp_sack_nr_max, + }, + { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 62b776f90037..0a2ea0bbf867 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2595,6 +2595,7 @@ int tcp_disconnect(struct sock *sk, int flags) dst_release(sk->sk_rx_dst); sk->sk_rx_dst = NULL; tcp_saved_syn_free(tp); + tp->compressed_ack = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b188e0d75edd..1191cac72109 100644 --- a/net/ipv4/tcp_input.c 
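The two ipv4_net_table entries above wire the new SACK-compression knobs into /proc; their defaults (1 ms and 44) are set further down in tcp_sk_init(). Below is a minimal user-space sketch for inspecting them, not part of the patch, assuming only the usual mapping of ipv4_net_table procnames to /proc/sys/net/ipv4/:

#include <stdio.h>

/* Sketch: print the new SACK-compression sysctls. The paths follow
 * the procname fields registered in ipv4_net_table above.
 */
static void dump_sysctl(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("%s = %s", path, buf);
		fclose(f);
	}
}

int main(void)
{
	dump_sysctl("/proc/sys/net/ipv4/tcp_comp_sack_delay_ns");
	dump_sysctl("/proc/sys/net/ipv4/tcp_comp_sack_nr");
	return 0;
}

Given the [zero, comp_sack_nr_max] bounds above, raising tcp_comp_sack_nr toward its 255 cap trades SACK latency for fewer ACK packets, while setting it to 0 effectively disables compression.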
+++ b/net/ipv4/tcp_input.c @@ -203,21 +203,23 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) } } -static void tcp_incr_quickack(struct sock *sk) +static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); if (quickacks == 0) quickacks = 2; + quickacks = min(quickacks, max_quickacks); if (quickacks > icsk->icsk_ack.quick) - icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); + icsk->icsk_ack.quick = quickacks; } -static void tcp_enter_quickack_mode(struct sock *sk) +static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); - tcp_incr_quickack(sk); + + tcp_incr_quickack(sk, max_quickacks); icsk->icsk_ack.pingpong = 0; icsk->icsk_ack.ato = TCP_ATO_MIN; } @@ -261,7 +263,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) * it is probably a retransmit. */ if (tp->ecn_flags & TCP_ECN_SEEN) - tcp_enter_quickack_mode((struct sock *)tp); + tcp_enter_quickack_mode((struct sock *)tp, 1); break; case INET_ECN_CE: if (tcp_ca_needs_ecn((struct sock *)tp)) @@ -269,7 +271,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { /* Better not delay acks, sender can have a very low cwnd */ - tcp_enter_quickack_mode((struct sock *)tp); + tcp_enter_quickack_mode((struct sock *)tp, 1); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } tp->ecn_flags |= TCP_ECN_SEEN; @@ -686,7 +688,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) /* The _first_ data packet received, initialize * delayed ACK engine. */ - tcp_incr_quickack(sk); + tcp_incr_quickack(sk, TCP_MAX_QUICKACKS); icsk->icsk_ack.ato = TCP_ATO_MIN; } else { int m = now - icsk->icsk_ack.lrcvtime; @@ -702,7 +704,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) /* Too long gap. Apparently sender failed to * restart window, so that we send ACKs quickly. */ - tcp_incr_quickack(sk); + tcp_incr_quickack(sk, TCP_MAX_QUICKACKS); sk_mem_reclaim(sk); } } @@ -1917,19 +1919,54 @@ static inline void tcp_init_undo(struct tcp_sock *tp) tp->undo_retrans = tp->retrans_out ? : -1; } -/* Enter Loss state. If we detect SACK reneging, forget all SACK information +static bool tcp_is_rack(const struct sock *sk) +{ + return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION; +} + +/* If we detect SACK reneging, forget all SACK information * and reset tags completely, otherwise preserve SACKs. If receiver * dropped its ofo queue, we will know this due to reneging detection. */ +static void tcp_timeout_mark_lost(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb, *head; + bool is_reneg; /* is receiver reneging on SACKs? */ + + head = tcp_rtx_queue_head(sk); + is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED); + if (is_reneg) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); + tp->sacked_out = 0; + /* Mark SACK reneging until we recover from this loss event. 
*/ + tp->is_sack_reneg = 1; + } else if (tcp_is_reno(tp)) { + tcp_reset_reno_sack(tp); + } + + skb = head; + skb_rbtree_walk_from(skb) { + if (is_reneg) + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; + else if (tcp_is_rack(sk) && skb != head && + tcp_rack_skb_timeout(tp, skb, 0) > 0) + continue; /* Don't mark recently sent ones lost yet */ + tcp_mark_skb_lost(sk, skb); + } + tcp_verify_left_out(tp); + tcp_clear_all_retrans_hints(tp); +} + +/* Enter Loss state. */ void tcp_enter_loss(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); - struct sk_buff *skb; bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; - bool is_reneg; /* is receiver reneging on SACKs? */ - bool mark_lost; + + tcp_timeout_mark_lost(sk); /* Reduce ssthresh if it has not yet been made inside this window. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder || @@ -1941,40 +1978,10 @@ void tcp_enter_loss(struct sock *sk) tcp_ca_event(sk, CA_EVENT_LOSS); tcp_init_undo(tp); } - tp->snd_cwnd = 1; + tp->snd_cwnd = tcp_packets_in_flight(tp) + 1; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_jiffies32; - tp->retrans_out = 0; - tp->lost_out = 0; - - if (tcp_is_reno(tp)) - tcp_reset_reno_sack(tp); - - skb = tcp_rtx_queue_head(sk); - is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); - if (is_reneg) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); - tp->sacked_out = 0; - /* Mark SACK reneging until we recover from this loss event. */ - tp->is_sack_reneg = 1; - } - tcp_clear_all_retrans_hints(tp); - - skb_rbtree_walk_from(skb) { - mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || - is_reneg); - if (mark_lost) - tcp_sum_lost(tp, skb); - TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; - if (mark_lost) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - } - } - tcp_verify_left_out(tp); - /* Timeout in disordered state after receiving substantial DUPACKs * suggests that the degree of reordering is over-estimated. */ @@ -2141,7 +2148,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) return true; /* Not-A-Trick#2 : Classic rule... 
*/ - if (tcp_dupack_heuristics(tp) > tp->reordering) + if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering) return true; return false; @@ -2218,9 +2225,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_is_reno(tp)) { - tcp_mark_head_lost(sk, 1, 1); - } else { + if (tcp_is_sack(tp)) { int sacked_upto = tp->sacked_out - tp->reordering; if (sacked_upto >= 0) tcp_mark_head_lost(sk, sacked_upto, 0); @@ -2718,12 +2723,16 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) return false; } -static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) +static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag) { struct tcp_sock *tp = tcp_sk(sk); - /* Use RACK to detect loss */ - if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { + if (tcp_rtx_queue_empty(sk)) + return; + + if (unlikely(tcp_is_reno(tp))) { + tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED); + } else if (tcp_is_rack(sk)) { u32 prior_retrans = tp->retrans_out; tcp_rack_mark_lost(sk); @@ -2819,11 +2828,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, tcp_try_keep_open(sk); return; } - tcp_rack_identify_loss(sk, ack_flag); + tcp_identify_packet_loss(sk, ack_flag); break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack, rexmit); - tcp_rack_identify_loss(sk, ack_flag); + tcp_identify_packet_loss(sk, ack_flag); if (!(icsk->icsk_ca_state == TCP_CA_Open || (*ack_flag & FLAG_LOST_RETRANS))) return; @@ -2840,7 +2849,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); - tcp_rack_identify_loss(sk, ack_flag); + tcp_identify_packet_loss(sk, ack_flag); if (!tcp_time_to_recover(sk, flag)) { tcp_try_to_open(sk, flag); return; @@ -2862,7 +2871,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, fast_rexmit = 1; } - if (do_lost) + if (!tcp_is_rack(sk) && do_lost) tcp_update_scoreboard(sk, fast_rexmit); *rexmit = REXMIT_LOST; } @@ -4172,7 +4181,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); - tcp_enter_quickack_mode(sk); + tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; @@ -4242,6 +4251,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) * If the sack array is full, forget about the last one. 
*/ if (this_sack >= TCP_NUM_SACKS) { + if (tp->compressed_ack) + tcp_send_ack(sk); this_sack--; tp->rx_opt.num_sacks--; sp--; @@ -4697,7 +4708,7 @@ queue_and_out: tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); out_of_window: - tcp_enter_quickack_mode(sk); + tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); inet_csk_schedule_ack(sk); drop: tcp_drop(sk, skb); @@ -4708,8 +4719,6 @@ drop: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) goto out_of_window; - tcp_enter_quickack_mode(sk); - if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", @@ -5076,6 +5085,7 @@ static inline void tcp_data_snd_check(struct sock *sk) static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) { struct tcp_sock *tp = tcp_sk(sk); + unsigned long rtt, delay; /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && @@ -5087,15 +5097,36 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || __tcp_select_window(sk) >= tp->rcv_wnd)) || /* We ACK each frame or... */ - tcp_in_quickack_mode(sk) || - /* We have out of order data. */ - (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) { - /* Then ack it now */ + tcp_in_quickack_mode(sk)) { +send_now: tcp_send_ack(sk); - } else { - /* Else, send delayed ack. */ + return; + } + + if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_send_delayed_ack(sk); + return; } + + if (!tcp_is_sack(tp) || + tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) + goto send_now; + tp->compressed_ack++; + + if (hrtimer_is_queued(&tp->compressed_ack_timer)) + return; + + /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */ + + rtt = tp->rcv_rtt_est.rtt_us; + if (tp->srtt_us && tp->srtt_us < rtt) + rtt = tp->srtt_us; + + delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, + rtt * (NSEC_PER_USEC >> 3)/20); + sock_hold(sk); + hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), + HRTIMER_MODE_REL_PINNED_SOFT); } static inline void tcp_ack_snd_check(struct sock *sk) @@ -5761,7 +5792,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * to stand against the temptation 8) --ANK */ inet_csk_schedule_ack(sk); - tcp_enter_quickack_mode(sk); + tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, TCP_RTO_MAX); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index caf23de88f8a..adbdb503db0c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2572,6 +2572,8 @@ static int __net_init tcp_sk_init(struct net *net) init_net.ipv4.sysctl_tcp_wmem, sizeof(init_net.ipv4.sysctl_tcp_wmem)); } + net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; + net->ipv4.sysctl_tcp_comp_sack_nr = 44; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0d8f950a9006..8e08b409c71e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -162,6 +162,15 @@ static void tcp_event_data_sent(struct tcp_sock *tp, /* Account for an ACK we sent. 
*/ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { + struct tcp_sock *tp = tcp_sk(sk); + + if (unlikely(tp->compressed_ack)) { + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, + tp->compressed_ack); + tp->compressed_ack = 0; + if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) + __sock_put(sk); + } tcp_dec_quickack_mode(sk, pkts); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } @@ -2814,8 +2823,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) return -EBUSY; if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { - if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) - BUG(); + if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) { + WARN_ON_ONCE(1); + return -EINVAL; + } if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) return -ENOMEM; } @@ -3323,6 +3334,7 @@ static void tcp_connect_init(struct sock *sk) sock_reset_flag(sk, SOCK_DONE); tp->snd_wnd = 0; tcp_init_wl(tp, 0); + tcp_write_queue_purge(sk); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 3a81720ac0c4..71593e4400ab 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -2,7 +2,7 @@ #include <linux/tcp.h> #include <net/tcp.h> -static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -21,6 +21,38 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) return t1 > t2 || (t1 == t2 && after(seq1, seq2)); } +static u32 tcp_rack_reo_wnd(const struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->rack.reord) { + /* If reordering has not been observed, be aggressive during + * the recovery or starting the recovery by DUPACK threshold. + */ + if (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery) + return 0; + + if (tp->sacked_out >= tp->reordering && + !(sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_NO_DUPTHRESH)) + return 0; + } + + /* To be more reordering resilient, allow min_rtt/4 settling delay. + * Use min_rtt instead of the smoothed RTT because reordering is + * often a path property and less related to queuing or delayed ACKs. + * Upon receiving DSACKs, linearly increase the window up to the + * smoothed RTT. + */ + return min((tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps, + tp->srtt_us >> 3); +} + +s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd) +{ + return tp->rack.rtt_us + reo_wnd - + tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); +} + /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): * * Marks a packet lost, if some packet sent later has been (s)acked. @@ -44,23 +76,11 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); - u32 min_rtt = tcp_min_rtt(tp); struct sk_buff *skb, *n; u32 reo_wnd; *reo_timeout = 0; - /* To be more reordering resilient, allow min_rtt/4 settling delay - * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed - * RTT because reordering is often a path property and less related - * to queuing or delayed ACKs. 
- */ - reo_wnd = 1000; - if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) && - min_rtt != ~0U) { - reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd); - reo_wnd = min(reo_wnd, tp->srtt_us >> 3); - } - + reo_wnd = tcp_rack_reo_wnd(sk); list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, tcp_tsorted_anchor) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); @@ -78,10 +98,9 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) /* A packet is lost if it has not been s/acked beyond * the recent RTT plus the reordering window. */ - remaining = tp->rack.rtt_us + reo_wnd - - tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); + remaining = tcp_rack_skb_timeout(tp, skb, reo_wnd); if (remaining <= 0) { - tcp_rack_mark_skb_lost(sk, skb); + tcp_mark_skb_lost(sk, skb); list_del_init(&skb->tcp_tsorted_anchor); } else { /* Record maximum wait time */ @@ -202,3 +221,30 @@ void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) tp->rack.reo_wnd_steps = 1; } } + +/* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits + * the next unacked packet upon receiving + * a) three or more DUPACKs to start the fast recovery + * b) an ACK acknowledging new data during the fast recovery. + */ +void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced) +{ + const u8 state = inet_csk(sk)->icsk_ca_state; + struct tcp_sock *tp = tcp_sk(sk); + + if ((state < TCP_CA_Recovery && tp->sacked_out >= tp->reordering) || + (state == TCP_CA_Recovery && snd_una_advanced)) { + struct sk_buff *skb = tcp_rtx_queue_head(sk); + u32 mss; + + if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) + return; + + mss = tcp_skb_mss(skb); + if (tcp_skb_pcount(skb) > 1 && skb->len > mss) + tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + mss, mss, GFP_ATOMIC); + + tcp_skb_mark_lost_uncond_verify(tp, skb); + } +} diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 92bdf64fffae..3b3611729928 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -708,6 +708,27 @@ out: sock_put(sk); } +static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) +{ + struct tcp_sock *tp = container_of(timer, struct tcp_sock, compressed_ack_timer); + struct sock *sk = (struct sock *)tp; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + if (tp->compressed_ack) + tcp_send_ack(sk); + } else { + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, + &sk->sk_tsq_flags)) + sock_hold(sk); + } + bh_unlock_sock(sk); + + sock_put(sk); + + return HRTIMER_NORESTART; +} + void tcp_init_xmit_timers(struct sock *sk) { inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, @@ -715,4 +736,8 @@ void tcp_init_xmit_timers(struct sock *sk) hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_SOFT); tcp_sk(sk)->pacing_timer.function = tcp_pace_kick; + + hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED_SOFT); + tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ff4d4ba67735..d71f1f3e1155 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -788,7 +788,8 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, return -EINVAL; if (sk->sk_no_check_tx) return -EINVAL; - if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite) + if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite || + dst_xfrm(skb_dst(skb))) return -EIO; skb_shinfo(skb)->gso_size = cork->gso_size; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 
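Taken together, the TCP changes above arm a per-socket hrtimer instead of emitting every SACK immediately: __tcp_ack_snd_check() counts pending compressed ACKs and starts compressed_ack_timer at min(tcp_comp_sack_delay_ns, 5% of the RTT), tcp_compressed_ack_kick() flushes the ACK when the timer fires, and tcp_event_ack_sent() cancels the timer and folds the count into TCPAckCompressed. Because tp->srtt_us and tp->rcv_rtt_est.rtt_us are kept in 1/8-microsecond units, the (NSEC_PER_USEC >> 3) / 20 factor works out to 5% expressed in nanoseconds. A standalone restatement of that arithmetic follows; the fixed-point assumption is mine, inferred from the >> 3 in the kernel expression:

#include <stdio.h>

#define NSEC_PER_USEC 1000UL

/* rtt_shifted is in 1/8 us units, as tp->srtt_us is stored. Returns
 * min(cap_ns, 5% of the RTT) in nanoseconds, mirroring the delay
 * computed in __tcp_ack_snd_check().
 */
static unsigned long comp_sack_delay_ns(unsigned long rtt_shifted,
					unsigned long cap_ns)
{
	unsigned long delay = rtt_shifted * (NSEC_PER_USEC >> 3) / 20;

	return delay < cap_ns ? delay : cap_ns;
}

int main(void)
{
	/* 10 ms RTT (10000 us << 3), default 1 ms cap: prints 500000 */
	printf("%lu\n", comp_sack_delay_ns(10000UL << 3, 1000000UL));
	/* 100 ms RTT: 5% would be 5 ms, so the cap wins: prints 1000000 */
	printf("%lu\n", comp_sack_delay_ns(100000UL << 3, 1000000UL));
	return 0;
}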
d1dc6017f5a6..f9132a6de917 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -934,19 +934,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); - struct fib6_info *iter = NULL; + struct fib6_info *iter = NULL, *match = NULL; struct fib6_info __rcu **ins; - struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); + int append = (info->nlh && + (info->nlh->nlmsg_flags & NLM_F_APPEND)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; - bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); u16 nlflags = NLM_F_EXCL; int err; - if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) + if (append) nlflags |= NLM_F_APPEND; ins = &fn->leaf; @@ -968,13 +968,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, nlflags &= ~NLM_F_EXCL; if (replace) { - if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { - found++; - break; - } - if (rt_can_ecmp) - fallback_ins = fallback_ins ?: ins; - goto next_iter; + found++; + break; } if (rt6_duplicate_nexthop(iter, rt)) { @@ -989,86 +984,67 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } - /* If we have the same destination and the same metric, - * but not the same gateway, then the route we try to - * add is sibling to this route, increment our counter - * of siblings, and later we will add our route to the - * list. - * Only static routes (which don't have flag - * RTF_EXPIRES) are used for ECMPv6. - * - * To avoid long list, we only had siblings if the - * route have a gateway. - */ - if (rt_can_ecmp && - rt6_qualify_for_ecmp(iter)) - rt->fib6_nsiblings++; + + /* first route that matches */ + if (!match) + match = iter; } if (iter->fib6_metric > rt->fib6_metric) break; -next_iter: ins = &iter->fib6_next; } - if (fallback_ins && !found) { - /* No ECMP-able route found, replace first non-ECMP one */ - ins = fallback_ins; - iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->fib6_table->tb6_lock)); - found++; - } - /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. */ - if (rt->fib6_nsiblings) { - unsigned int fib6_nsiblings; + if (append && match) { struct fib6_info *sibling, *temp_sibling; - /* Find the first route that have the same metric */ - sibling = leaf; - while (sibling) { - if (sibling->fib6_metric == rt->fib6_metric && - rt6_qualify_for_ecmp(sibling)) { - list_add_tail(&rt->fib6_siblings, - &sibling->fib6_siblings); - break; - } - sibling = rcu_dereference_protected(sibling->fib6_next, - lockdep_is_held(&rt->fib6_table->tb6_lock)); + if (rt->fib6_flags & RTF_REJECT) { + NL_SET_ERR_MSG(extack, + "Can not append a REJECT route"); + return -EINVAL; + } else if (match->fib6_flags & RTF_REJECT) { + NL_SET_ERR_MSG(extack, + "Can not append to a REJECT route"); + return -EINVAL; } + rt->fib6_nsiblings = match->fib6_nsiblings; + list_add_tail(&rt->fib6_siblings, &match->fib6_siblings); + match->fib6_nsiblings++; + /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! 
*/ - fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, - &rt->fib6_siblings, fib6_siblings) { + &match->fib6_siblings, fib6_siblings) { sibling->fib6_nsiblings++; - BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); - fib6_nsiblings++; + BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings); } - BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); - rt6_multipath_rebalance(temp_sibling); + + rt6_multipath_rebalance(match); } /* * insert node */ if (!replace) { + enum fib_event_type event; + if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; - err = call_fib6_entry_notifiers(info->nl_net, - FIB_EVENT_ENTRY_ADD, - rt, extack); + event = append ? FIB_EVENT_ENTRY_APPEND : FIB_EVENT_ENTRY_ADD; + err = call_fib6_entry_notifiers(info->nl_net, event, rt, + extack); if (err) return err; @@ -1086,7 +1062,7 @@ add: } } else { - int nsiblings; + struct fib6_info *tmp; if (!found) { if (add) @@ -1101,48 +1077,57 @@ add: if (err) return err; + /* if route being replaced has siblings, set tmp to + * last one, otherwise tmp is current route. this is + * used to set fib6_next for new route + */ + if (iter->fib6_nsiblings) + tmp = list_last_entry(&iter->fib6_siblings, + struct fib6_info, + fib6_siblings); + else + tmp = iter; + + /* insert new route */ atomic_inc(&rt->fib6_ref); rcu_assign_pointer(rt->fib6_node, fn); - rt->fib6_next = iter->fib6_next; + rt->fib6_next = tmp->fib6_next; rcu_assign_pointer(*ins, rt); + if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } - nsiblings = iter->fib6_nsiblings; - iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); - if (rcu_access_pointer(fn->rr_ptr) == iter) - fn->rr_ptr = NULL; - fib6_info_release(iter); - if (nsiblings) { + /* delete old route */ + rt = iter; + + if (rt->fib6_nsiblings) { + struct fib6_info *tmp; + /* Replacing an ECMP route, remove all siblings */ - ins = &rt->fib6_next; - iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->fib6_table->tb6_lock)); - while (iter) { - if (iter->fib6_metric > rt->fib6_metric) - break; - if (rt6_qualify_for_ecmp(iter)) { - *ins = iter->fib6_next; - iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); - if (rcu_access_pointer(fn->rr_ptr) == iter) - fn->rr_ptr = NULL; - fib6_info_release(iter); - nsiblings--; - info->nl_net->ipv6.rt6_stats->fib_rt_entries--; - } else { - ins = &iter->fib6_next; - } - iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->fib6_table->tb6_lock)); + list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings, + fib6_siblings) { + iter->fib6_node = NULL; + fib6_purge_rt(iter, fn, info->nl_net); + if (rcu_access_pointer(fn->rr_ptr) == iter) + fn->rr_ptr = NULL; + fib6_info_release(iter); + + rt->fib6_nsiblings--; + info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } - WARN_ON(nsiblings != 0); } + + WARN_ON(rt->fib6_nsiblings != 0); + + rt->fib6_node = NULL; + fib6_purge_rt(rt, fn, info->nl_net); + if (rcu_access_pointer(fn->rr_ptr) == rt) + fn->rr_ptr = NULL; + fib6_info_release(rt); } return 0; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index bede77f24784..c8cf2fdbb13b 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -71,6 +71,7 @@ struct ip6gre_net { struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE]; struct ip6_tnl __rcu *collect_md_tun; + struct ip6_tnl __rcu *collect_md_tun_erspan; struct net_device *fb_tunnel_dev; }; 
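With the rt_can_ecmp/fallback_ins machinery removed, fib6_add_rt2node() above keys purely off the caller's netlink flags: NLM_F_REPLACE swaps the first same-metric route (plus all of its siblings), and NLM_F_APPEND links the new route as a sibling of the first match, rejecting RTF_REJECT routes on either side. A sketch of just that flag decoding; the ip(8) subcommands in the comments reflect how iproute2 conventionally sets these flags, not something this patch defines:

#include <stdio.h>
#include <linux/netlink.h>

/* Sketch of the decoding at the top of fib6_add_rt2node(); hdr_flags < 0
 * stands in for "no nlmsghdr", where add defaults to true.
 */
static void decode(int hdr_flags)
{
	int replace = hdr_flags >= 0 && (hdr_flags & NLM_F_REPLACE);
	int append = hdr_flags >= 0 && (hdr_flags & NLM_F_APPEND);
	int add = hdr_flags < 0 || (hdr_flags & NLM_F_CREATE);

	printf("replace=%d append=%d add=%d\n", replace, append, add);
}

int main(void)
{
	decode(NLM_F_CREATE | NLM_F_EXCL);	/* ip -6 route add */
	decode(NLM_F_CREATE | NLM_F_REPLACE);	/* ip -6 route replace */
	decode(NLM_F_CREATE | NLM_F_APPEND);	/* ip -6 route append */
	decode(-1);				/* no netlink header */
	return 0;
}

Note that further down, ip6_route_multipath_add() is changed to set NLM_F_APPEND for the second and later nexthops, so multipath adds flow through this new append path.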
@@ -81,6 +82,7 @@ static int ip6gre_tunnel_init(struct net_device *dev); static void ip6gre_tunnel_setup(struct net_device *dev); static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t); static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu); +static void ip6erspan_tnl_link_config(struct ip6_tnl *t, int set_mtu); /* Tunnel hash table */ @@ -232,7 +234,12 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, if (cand) return cand; - t = rcu_dereference(ign->collect_md_tun); + if (gre_proto == htons(ETH_P_ERSPAN) || + gre_proto == htons(ETH_P_ERSPAN2)) + t = rcu_dereference(ign->collect_md_tun_erspan); + else + t = rcu_dereference(ign->collect_md_tun); + if (t && t->dev->flags & IFF_UP) return t; @@ -261,6 +268,31 @@ static struct ip6_tnl __rcu **__ip6gre_bucket(struct ip6gre_net *ign, return &ign->tunnels[prio][h]; } +static void ip6gre_tunnel_link_md(struct ip6gre_net *ign, struct ip6_tnl *t) +{ + if (t->parms.collect_md) + rcu_assign_pointer(ign->collect_md_tun, t); +} + +static void ip6erspan_tunnel_link_md(struct ip6gre_net *ign, struct ip6_tnl *t) +{ + if (t->parms.collect_md) + rcu_assign_pointer(ign->collect_md_tun_erspan, t); +} + +static void ip6gre_tunnel_unlink_md(struct ip6gre_net *ign, struct ip6_tnl *t) +{ + if (t->parms.collect_md) + rcu_assign_pointer(ign->collect_md_tun, NULL); +} + +static void ip6erspan_tunnel_unlink_md(struct ip6gre_net *ign, + struct ip6_tnl *t) +{ + if (t->parms.collect_md) + rcu_assign_pointer(ign->collect_md_tun_erspan, NULL); +} + static inline struct ip6_tnl __rcu **ip6gre_bucket(struct ip6gre_net *ign, const struct ip6_tnl *t) { @@ -271,9 +303,6 @@ static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t); - if (t->parms.collect_md) - rcu_assign_pointer(ign->collect_md_tun, t); - rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } @@ -283,9 +312,6 @@ static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t) struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; - if (t->parms.collect_md) - rcu_assign_pointer(ign->collect_md_tun, NULL); - for (tp = ip6gre_bucket(ign, t); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { @@ -374,11 +400,23 @@ failed_free: return NULL; } +static void ip6erspan_tunnel_uninit(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); + + ip6erspan_tunnel_unlink_md(ign, t); + ip6gre_tunnel_unlink(ign, t); + dst_cache_reset(&t->dst_cache); + dev_put(dev); +} + static void ip6gre_tunnel_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); + ip6gre_tunnel_unlink_md(ign, t); ip6gre_tunnel_unlink(ign, t); dst_cache_reset(&t->dst_cache); dev_put(dev); @@ -698,6 +736,9 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, else fl6->daddr = tunnel->parms.raddr; + if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) + return -ENOMEM; + /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? 
htons(ETH_P_TEB) : proto; @@ -920,7 +961,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)) truncate = true; - if (skb_cow_head(skb, dev->needed_headroom)) + if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen)) goto tx_err; t->parms.o_flags &= ~TUNNEL_KEY; @@ -991,11 +1032,14 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, erspan_build_header(skb, ntohl(t->parms.o_key), t->parms.index, truncate, false); - else + else if (t->parms.erspan_ver == 2) erspan_build_header_v2(skb, ntohl(t->parms.o_key), t->parms.dir, t->parms.hwid, truncate, false); + else + goto tx_err; + fl6.daddr = t->parms.raddr; } @@ -1031,12 +1075,11 @@ tx_err: return NETDEV_TX_OK; } -static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) +static void ip6gre_tnl_link_config_common(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; - int t_hlen; if (dev->type != ARPHRD_ETHER) { memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); @@ -1063,12 +1106,13 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; +} - t->tun_hlen = gre_calc_hlen(t->parms.o_flags); - - t->hlen = t->encap_hlen + t->tun_hlen; - - t_hlen = t->hlen + sizeof(struct ipv6hdr); +static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu, + int t_hlen) +{ + const struct __ip6_tnl_parm *p = &t->parms; + struct net_device *dev = t->dev; if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & @@ -1100,8 +1144,26 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) } } -static int ip6gre_tnl_change(struct ip6_tnl *t, - const struct __ip6_tnl_parm *p, int set_mtu) +static int ip6gre_calc_hlen(struct ip6_tnl *tunnel) +{ + int t_hlen; + + tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; + + t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); + tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen; + return t_hlen; +} + +static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) +{ + ip6gre_tnl_link_config_common(t); + ip6gre_tnl_link_config_route(t, set_mtu, ip6gre_calc_hlen(t)); +} + +static void ip6gre_tnl_copy_tnl_parm(struct ip6_tnl *t, + const struct __ip6_tnl_parm *p) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; @@ -1117,6 +1179,12 @@ static int ip6gre_tnl_change(struct ip6_tnl *t, t->parms.o_flags = p->o_flags; t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); +} + +static int ip6gre_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p, + int set_mtu) +{ + ip6gre_tnl_copy_tnl_parm(t, p); ip6gre_tnl_link_config(t, set_mtu); return 0; } @@ -1395,11 +1463,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) if (ret) goto cleanup_dst_cache_init; - tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); - tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; - t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); - - dev->hard_header_len = LL_MAX_HEADER + t_hlen; + t_hlen = ip6gre_calc_hlen(tunnel); dev->mtu = ETH_DATA_LEN - t_hlen; if (dev->type == ARPHRD_ETHER) dev->mtu -= ETH_HLEN; @@ -1749,6 +1813,19 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_get_iflink = ip6_tnl_get_iflink, }; +static int ip6erspan_calc_hlen(struct ip6_tnl *tunnel) +{ + int t_hlen; + + tunnel->tun_hlen = 8; + tunnel->hlen = tunnel->tun_hlen + 
tunnel->encap_hlen + + erspan_hdr_len(tunnel->parms.erspan_ver); + + t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); + tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen; + return t_hlen; +} + static int ip6erspan_tap_init(struct net_device *dev) { struct ip6_tnl *tunnel; @@ -1773,12 +1850,7 @@ static int ip6erspan_tap_init(struct net_device *dev) if (ret) goto cleanup_dst_cache_init; - tunnel->tun_hlen = 8; - tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + - erspan_hdr_len(tunnel->parms.erspan_ver); - t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); - - dev->hard_header_len = LL_MAX_HEADER + t_hlen; + t_hlen = ip6erspan_calc_hlen(tunnel); dev->mtu = ETH_DATA_LEN - t_hlen; if (dev->type == ARPHRD_ETHER) dev->mtu -= ETH_HLEN; @@ -1786,7 +1858,7 @@ static int ip6erspan_tap_init(struct net_device *dev) dev->mtu -= 8; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - ip6gre_tnl_link_config(tunnel, 1); + ip6erspan_tnl_link_config(tunnel, 1); return 0; @@ -1800,7 +1872,7 @@ cleanup_alloc_pcpu_stats: static const struct net_device_ops ip6erspan_netdev_ops = { .ndo_init = ip6erspan_tap_init, - .ndo_uninit = ip6gre_tunnel_uninit, + .ndo_uninit = ip6erspan_tunnel_uninit, .ndo_start_xmit = ip6erspan_tunnel_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, @@ -1864,13 +1936,11 @@ static bool ip6gre_netlink_encap_parms(struct nlattr *data[], return ret; } -static int ip6gre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) +static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) { struct ip6_tnl *nt; - struct net *net = dev_net(dev); - struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); struct ip_tunnel_encap ipencap; int err; @@ -1883,16 +1953,6 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, return err; } - ip6gre_netlink_parms(data, &nt->parms); - - if (nt->parms.collect_md) { - if (rtnl_dereference(ign->collect_md_tun)) - return -EEXIST; - } else { - if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) - return -EEXIST; - } - if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); @@ -1903,51 +1963,94 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, if (err) goto out; - ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); - if (tb[IFLA_MTU]) ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); dev_hold(dev); - ip6gre_tunnel_link(ign, nt); out: return err; } -static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[], - struct netlink_ext_ack *extack) +static int ip6gre_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct ip6_tnl *nt = netdev_priv(dev); + struct net *net = dev_net(dev); + struct ip6gre_net *ign; + int err; + + ip6gre_netlink_parms(data, &nt->parms); + ign = net_generic(net, ip6gre_net_id); + + if (nt->parms.collect_md) { + if (rtnl_dereference(ign->collect_md_tun)) + return -EEXIST; + } else { + if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) + return -EEXIST; + } + + err = ip6gre_newlink_common(src_net, dev, tb, data, extack); + if (!err) { + ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); + ip6gre_tunnel_link_md(ign, nt); + ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt); + } + return err; +} + +static struct ip6_tnl * +ip6gre_changelink_common(struct net_device 
*dev, struct nlattr *tb[], + struct nlattr *data[], struct __ip6_tnl_parm *p_p, + struct netlink_ext_ack *extack) { struct ip6_tnl *t, *nt = netdev_priv(dev); struct net *net = nt->net; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); - struct __ip6_tnl_parm p; struct ip_tunnel_encap ipencap; if (dev == ign->fb_tunnel_dev) - return -EINVAL; + return ERR_PTR(-EINVAL); if (ip6gre_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) - return err; + return ERR_PTR(err); } - ip6gre_netlink_parms(data, &p); + ip6gre_netlink_parms(data, p_p); - t = ip6gre_tunnel_locate(net, &p, 0); + t = ip6gre_tunnel_locate(net, p_p, 0); if (t) { if (t->dev != dev) - return -EEXIST; + return ERR_PTR(-EEXIST); } else { t = nt; } + return t; +} + +static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id); + struct __ip6_tnl_parm p; + struct ip6_tnl *t; + + t = ip6gre_changelink_common(dev, tb, data, &p, extack); + if (IS_ERR(t)) + return PTR_ERR(t); + + ip6gre_tunnel_unlink_md(ign, t); ip6gre_tunnel_unlink(ign, t); ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]); + ip6gre_tunnel_link_md(ign, t); ip6gre_tunnel_link(ign, t); return 0; } @@ -2097,6 +2200,69 @@ static void ip6erspan_tap_setup(struct net_device *dev) netif_keep_dst(dev); } +static int ip6erspan_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct ip6_tnl *nt = netdev_priv(dev); + struct net *net = dev_net(dev); + struct ip6gre_net *ign; + int err; + + ip6gre_netlink_parms(data, &nt->parms); + ign = net_generic(net, ip6gre_net_id); + + if (nt->parms.collect_md) { + if (rtnl_dereference(ign->collect_md_tun_erspan)) + return -EEXIST; + } else { + if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) + return -EEXIST; + } + + err = ip6gre_newlink_common(src_net, dev, tb, data, extack); + if (!err) { + ip6erspan_tnl_link_config(nt, !tb[IFLA_MTU]); + ip6erspan_tunnel_link_md(ign, nt); + ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt); + } + return err; +} + +static void ip6erspan_tnl_link_config(struct ip6_tnl *t, int set_mtu) +{ + ip6gre_tnl_link_config_common(t); + ip6gre_tnl_link_config_route(t, set_mtu, ip6erspan_calc_hlen(t)); +} + +static int ip6erspan_tnl_change(struct ip6_tnl *t, + const struct __ip6_tnl_parm *p, int set_mtu) +{ + ip6gre_tnl_copy_tnl_parm(t, p); + ip6erspan_tnl_link_config(t, set_mtu); + return 0; +} + +static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id); + struct __ip6_tnl_parm p; + struct ip6_tnl *t; + + t = ip6gre_changelink_common(dev, tb, data, &p, extack); + if (IS_ERR(t)) + return PTR_ERR(t); + + ip6gre_tunnel_unlink_md(ign, t); + ip6gre_tunnel_unlink(ign, t); + ip6erspan_tnl_change(t, &p, !tb[IFLA_MTU]); + ip6erspan_tunnel_link_md(ign, t); + ip6gre_tunnel_link(ign, t); + return 0; +} + static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { .kind = "ip6gre", .maxtype = IFLA_GRE_MAX, @@ -2133,8 +2299,8 @@ static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = { .priv_size = sizeof(struct ip6_tnl), .setup = ip6erspan_tap_setup, .validate = ip6erspan_tap_validate, - .newlink = ip6gre_newlink, - .changelink = ip6gre_changelink, + .newlink = ip6erspan_newlink, + .changelink = 
ip6erspan_changelink, .get_size = ip6gre_get_size, .fill_info = ip6gre_fill_info, .get_link_net = ip6_tnl_get_link_net, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7f4493080df6..60b0d1652448 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1494,7 +1494,8 @@ alloc_new_skb: if (copy > length) copy = length; - if (!(rt->dst.dev->features&NETIF_F_SG)) { + if (!(rt->dst.dev->features&NETIF_F_SG) && + skb_tailroom(skb) >= copy) { unsigned int off; off = skb->len; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 7097bbf95843..0758b5bcfb29 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -38,6 +38,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv6 packet filter"); +MODULE_ALIAS("ip6t_icmp6"); void *ip6t_alloc_initial_table(const struct xt_table *info) { @@ -1792,6 +1793,8 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, /* set res now, will see skbs right after nf_register_net_hooks */ WRITE_ONCE(*res, new_table); + if (!ops) + return 0; ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); if (ret != 0) { @@ -1809,7 +1812,8 @@ out_free: void ip6t_unregister_table(struct net *net, struct xt_table *table, const struct nf_hook_ops *ops) { - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + if (ops) + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); __ip6t_unregister_table(net, table); } diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index d12f511929f5..0fe61ede77c6 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -48,6 +48,8 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, } fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? 
skb->mark : 0; + if ((flags & XT_RPFILTER_LOOSE) == 0) + fl6.flowi6_oif = dev->ifindex; rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags); if (rt->dst.error) diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 2bf554e18af8..67ba70ab9f5c 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -40,69 +40,58 @@ static unsigned int ip6table_nat_do_chain(void *priv, return ip6t_do_table(skb, state, state->net->ipv6.ip6table_nat); } -static unsigned int ip6table_nat_fn(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_nat_ipv6_fn(priv, skb, state, ip6table_nat_do_chain); -} - -static unsigned int ip6table_nat_in(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_nat_ipv6_in(priv, skb, state, ip6table_nat_do_chain); -} - -static unsigned int ip6table_nat_out(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_nat_ipv6_out(priv, skb, state, ip6table_nat_do_chain); -} - -static unsigned int ip6table_nat_local_fn(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_nat_ipv6_local_fn(priv, skb, state, ip6table_nat_do_chain); -} - static const struct nf_hook_ops nf_nat_ipv6_ops[] = { - /* Before packet filtering, change destination */ { - .hook = ip6table_nat_in, + .hook = ip6table_nat_do_chain, .pf = NFPROTO_IPV6, - .nat_hook = true, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_NAT_DST, }, - /* After packet filtering, change source */ { - .hook = ip6table_nat_out, + .hook = ip6table_nat_do_chain, .pf = NFPROTO_IPV6, - .nat_hook = true, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_NAT_SRC, }, - /* Before packet filtering, change destination */ { - .hook = ip6table_nat_local_fn, + .hook = ip6table_nat_do_chain, .pf = NFPROTO_IPV6, - .nat_hook = true, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST, }, - /* After packet filtering, change source */ { - .hook = ip6table_nat_fn, - .nat_hook = true, + .hook = ip6table_nat_do_chain, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC, }, }; +static int ip6t_nat_register_lookups(struct net *net) +{ + int i, ret; + + for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++) { + ret = nf_nat_l3proto_ipv6_register_fn(net, &nf_nat_ipv6_ops[i]); + if (ret) { + while (i) + nf_nat_l3proto_ipv6_unregister_fn(net, &nf_nat_ipv6_ops[--i]); + + return ret; + } + } + + return 0; +} + +static void ip6t_nat_unregister_lookups(struct net *net) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++) + nf_nat_l3proto_ipv6_unregister_fn(net, &nf_nat_ipv6_ops[i]); +} + static int __net_init ip6table_nat_table_init(struct net *net) { struct ip6t_replace *repl; @@ -115,7 +104,17 @@ static int __net_init ip6table_nat_table_init(struct net *net) if (repl == NULL) return -ENOMEM; ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl, - nf_nat_ipv6_ops, &net->ipv6.ip6table_nat); + NULL, &net->ipv6.ip6table_nat); + if (ret < 0) { + kfree(repl); + return ret; + } + + ret = ip6t_nat_register_lookups(net); + if (ret < 0) { + ip6t_unregister_table(net, net->ipv6.ip6table_nat, NULL); + net->ipv6.ip6table_nat = NULL; + } kfree(repl); return ret; } @@ -124,7 +123,8 @@ static void __net_exit ip6table_nat_net_exit(struct net *net) { if (!net->ipv6.ip6table_nat) return; - ip6t_unregister_table(net, net->ipv6.ip6table_nat, nf_nat_ipv6_ops); + ip6t_nat_unregister_lookups(net); + ip6t_unregister_table(net, 
net->ipv6.ip6table_nat, NULL); net->ipv6.ip6table_nat = NULL; } diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index f1582b6f9588..ca6d38698b1a 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -252,17 +252,12 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, } EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation); -unsigned int +static unsigned int nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)) + const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; - struct nf_conn_nat *nat; - enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); __be16 frag_off; int hdrlen; u8 nexthdr; @@ -276,11 +271,7 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, if (!ct) return NF_ACCEPT; - nat = nfct_nat(ct); - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: + if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) { nexthdr = ipv6_hdr(skb)->nexthdr; hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); @@ -293,75 +284,29 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, else return NF_ACCEPT; } - /* Only ICMPs can be IP_CT_IS_REPLY: */ - /* fall through */ - case IP_CT_NEW: - /* Seen it before? This can happen for loopback, retrans, - * or local packets. - */ - if (!nf_nat_initialized(ct, maniptype)) { - unsigned int ret; - - ret = do_chain(priv, skb, state); - if (ret != NF_ACCEPT) - return ret; - - if (nf_nat_initialized(ct, HOOK2MANIP(state->hook))) - break; - - ret = nf_nat_alloc_null_binding(ct, state->hook); - if (ret != NF_ACCEPT) - return ret; - } else { - pr_debug("Already setup manip %s for ct %p\n", - maniptype == NF_NAT_MANIP_SRC ? 
"SRC" : "DST", - ct); - if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) - goto oif_changed; - } - break; - - default: - /* ESTABLISHED */ - WARN_ON(ctinfo != IP_CT_ESTABLISHED && - ctinfo != IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) - goto oif_changed; } - return nf_nat_packet(ct, ctinfo, state->hook, skb); - -oif_changed: - nf_ct_kill_acct(ct, ctinfo, skb); - return NF_DROP; + return nf_nat_inet_fn(priv, skb, state); } -EXPORT_SYMBOL_GPL(nf_nat_ipv6_fn); -unsigned int +static unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)) + const struct nf_hook_state *state) { unsigned int ret; struct in6_addr daddr = ipv6_hdr(skb)->daddr; - ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); + ret = nf_nat_ipv6_fn(priv, skb, state); if (ret != NF_DROP && ret != NF_STOLEN && ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) skb_dst_drop(skb); return ret; } -EXPORT_SYMBOL_GPL(nf_nat_ipv6_in); -unsigned int +static unsigned int nf_nat_ipv6_out(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)) + const struct nf_hook_state *state) { #ifdef CONFIG_XFRM const struct nf_conn *ct; @@ -370,7 +315,7 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb, #endif unsigned int ret; - ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); + ret = nf_nat_ipv6_fn(priv, skb, state); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && @@ -390,21 +335,17 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb, #endif return ret; } -EXPORT_SYMBOL_GPL(nf_nat_ipv6_out); -unsigned int +static unsigned int nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)) + const struct nf_hook_state *state) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; int err; - ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); + ret = nf_nat_ipv6_fn(priv, skb, state); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); @@ -428,7 +369,49 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, } return ret; } -EXPORT_SYMBOL_GPL(nf_nat_ipv6_local_fn); + +static const struct nf_hook_ops nf_nat_ipv6_ops[] = { + /* Before packet filtering, change destination */ + { + .hook = nf_nat_ipv6_in, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_NAT_DST, + }, + /* After packet filtering, change source */ + { + .hook = nf_nat_ipv6_out, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_NAT_SRC, + }, + /* Before packet filtering, change destination */ + { + .hook = nf_nat_ipv6_local_fn, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_NAT_DST, + }, + /* After packet filtering, change source */ + { + .hook = nf_nat_ipv6_fn, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_NAT_SRC, + }, +}; + +int nf_nat_l3proto_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops) +{ + return nf_nat_register_fn(net, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); +} +EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv6_register_fn); + +void 
nf_nat_l3proto_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops) +{ + nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); +} +EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv6_unregister_fn); static int __init nf_nat_l3proto_ipv6_init(void) { diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index 100a6bd1046a..8a081ad7d5db 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -36,42 +36,14 @@ static unsigned int nft_nat_do_chain(void *priv, return nft_do_chain(&pkt, priv); } -static unsigned int nft_nat_ipv6_fn(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) +static int nft_nat_ipv6_reg(struct net *net, const struct nf_hook_ops *ops) { - return nf_nat_ipv6_fn(priv, skb, state, nft_nat_do_chain); + return nf_nat_l3proto_ipv6_register_fn(net, ops); } -static unsigned int nft_nat_ipv6_in(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) +static void nft_nat_ipv6_unreg(struct net *net, const struct nf_hook_ops *ops) { - return nf_nat_ipv6_in(priv, skb, state, nft_nat_do_chain); -} - -static unsigned int nft_nat_ipv6_out(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_nat_ipv6_out(priv, skb, state, nft_nat_do_chain); -} - -static unsigned int nft_nat_ipv6_local_fn(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_nat_ipv6_local_fn(priv, skb, state, nft_nat_do_chain); -} - -static int nft_nat_ipv6_init(struct nft_ctx *ctx) -{ - return nf_ct_netns_get(ctx->net, ctx->family); -} - -static void nft_nat_ipv6_free(struct nft_ctx *ctx) -{ - nf_ct_netns_put(ctx->net, ctx->family); + nf_nat_l3proto_ipv6_unregister_fn(net, ops); } static const struct nft_chain_type nft_chain_nat_ipv6 = { @@ -84,13 +56,13 @@ static const struct nft_chain_type nft_chain_nat_ipv6 = { (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), .hooks = { - [NF_INET_PRE_ROUTING] = nft_nat_ipv6_in, - [NF_INET_POST_ROUTING] = nft_nat_ipv6_out, - [NF_INET_LOCAL_OUT] = nft_nat_ipv6_local_fn, - [NF_INET_LOCAL_IN] = nft_nat_ipv6_fn, + [NF_INET_PRE_ROUTING] = nft_nat_do_chain, + [NF_INET_POST_ROUTING] = nft_nat_do_chain, + [NF_INET_LOCAL_OUT] = nft_nat_do_chain, + [NF_INET_LOCAL_IN] = nft_nat_do_chain, }, - .init = nft_nat_ipv6_init, - .free = nft_nat_ipv6_free, + .ops_register = nft_nat_ipv6_reg, + .ops_unregister = nft_nat_ipv6_unreg, }; static int __init nft_chain_nat_ipv6_init(void) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dc5d5c84dbef..0a35ded448a6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -63,6 +63,7 @@ #include <net/lwtunnel.h> #include <net/ip_tunnels.h> #include <net/l3mdev.h> +#include <net/ip.h> #include <trace/events/fib6.h> #include <linux/uaccess.h> @@ -3839,7 +3840,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric == rt->fib6_metric && - rt6_qualify_for_ecmp(iter)) + iter->fib6_nsiblings) return iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); @@ -4131,6 +4132,9 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_UID] = { .type = NLA_U32 }, [RTA_MARK] = { .type = NLA_U32 }, [RTA_TABLE] = { .type = NLA_U32 }, + [RTA_IP_PROTO] = { .type = NLA_U8 }, + [RTA_SPORT] = { .type = NLA_U16 }, + [RTA_DPORT] = { .type = NLA_U16 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr 
*nlh, @@ -4429,6 +4433,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, */ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); + cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND; nhn++; } @@ -4842,6 +4847,19 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, else fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); + if (tb[RTA_SPORT]) + fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); + + if (tb[RTA_DPORT]) + fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); + + if (tb[RTA_IP_PROTO]) { + err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], + &fl6.flowi6_proto, extack); + if (err) + goto errout; + } + if (iif) { struct net_device *dev; int flags = 0; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2839c1bd1e58..426c9d2b418d 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1053,7 +1053,8 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, return -EINVAL; if (udp_sk(sk)->no_check6_tx) return -EINVAL; - if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite) + if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite || + dst_xfrm(skb_dst(skb))) return -EIO; skb_shinfo(skb)->gso_size = cork->gso_size; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 85dbaa891059..bdf6fa78d0d2 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -695,7 +695,7 @@ static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev, if (sta) { ret = 0; memcpy(mac, sta->sta.addr, ETH_ALEN); - sta_set_sinfo(sta, sinfo); + sta_set_sinfo(sta, sinfo, true); } mutex_unlock(&local->sta_mtx); @@ -724,7 +724,7 @@ static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev, sta = sta_info_get_bss(sdata, mac); if (sta) { ret = 0; - sta_set_sinfo(sta, sinfo); + sta_set_sinfo(sta, sinfo, true); } mutex_unlock(&local->sta_mtx); @@ -2376,6 +2376,11 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) (WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG)) ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS); + if (changed & (WIPHY_PARAM_TXQ_LIMIT | + WIPHY_PARAM_TXQ_MEMORY_LIMIT | + WIPHY_PARAM_TXQ_QUANTUM)) + ieee80211_txq_set_params(local); + return 0; } @@ -3705,6 +3710,99 @@ static int ieee80211_set_multicast_to_unicast(struct wiphy *wiphy, return 0; } +void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats, + struct txq_info *txqi) +{ + if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_BACKLOG_BYTES))) { + txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_BYTES); + txqstats->backlog_bytes = txqi->tin.backlog_bytes; + } + + if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS))) { + txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS); + txqstats->backlog_packets = txqi->tin.backlog_packets; + } + + if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_FLOWS))) { + txqstats->filled |= BIT(NL80211_TXQ_STATS_FLOWS); + txqstats->flows = txqi->tin.flows; + } + + if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_DROPS))) { + txqstats->filled |= BIT(NL80211_TXQ_STATS_DROPS); + txqstats->drops = txqi->cstats.drop_count; + } + + if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_ECN_MARKS))) { + txqstats->filled |= BIT(NL80211_TXQ_STATS_ECN_MARKS); + txqstats->ecn_marks = txqi->cstats.ecn_mark; + } + + if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_OVERLIMIT))) { + txqstats->filled |= BIT(NL80211_TXQ_STATS_OVERLIMIT); + txqstats->overlimit = txqi->tin.overlimit; + } + + if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_COLLISIONS))) { + txqstats->filled |= BIT(NL80211_TXQ_STATS_COLLISIONS); 
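
The inet6_rtm_getroute() changes above let RTM_GETROUTE requests carry the new RTA_IP_PROTO, RTA_SPORT and RTA_DPORT attributes so the kernel resolves the route for a specific flow rather than a bare destination. Below is a minimal, hedged userspace sketch of such a request; the addattr() helper and the example address/ports are illustrative boilerplate, not part of this series, the three RTA_* constants need uapi headers at least as new as these patches, and ports go out big-endian to match the nla_get_be16() consumers above.

#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Append one attribute; standard rtnetlink idiom, no bounds check
 * since buf below is ample for this demo. */
static void addattr(struct nlmsghdr *n, unsigned short type,
                    const void *data, int alen)
{
    struct rtattr *rta = (struct rtattr *)((char *)n + NLMSG_ALIGN(n->nlmsg_len));

    rta->rta_type = type;
    rta->rta_len = RTA_LENGTH(alen);
    memcpy(RTA_DATA(rta), data, alen);
    n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

int main(void)
{
    struct {
        struct nlmsghdr nlh;
        struct rtmsg rtm;
        char buf[256];
    } req;
    struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
    struct in6_addr dst;
    uint16_t sport = htons(1000), dport = htons(4500); /* example ports */
    uint8_t ipproto = IPPROTO_UDP;
    int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

    if (fd < 0)
        return 1;

    memset(&req, 0, sizeof(req));
    req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
    req.nlh.nlmsg_type = RTM_GETROUTE;
    req.nlh.nlmsg_flags = NLM_F_REQUEST;
    req.rtm.rtm_family = AF_INET6;
    req.rtm.rtm_dst_len = 128;

    inet_pton(AF_INET6, "2001:db8::1", &dst); /* example destination */
    addattr(&req.nlh, RTA_DST, &dst, sizeof(dst));
    /* the attributes added by this series, big-endian ports */
    addattr(&req.nlh, RTA_IP_PROTO, &ipproto, sizeof(ipproto));
    addattr(&req.nlh, RTA_SPORT, &sport, sizeof(sport));
    addattr(&req.nlh, RTA_DPORT, &dport, sizeof(dport));

    if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
               (struct sockaddr *)&sa, sizeof(sa)) < 0)
        perror("sendto");

    /* a real client would recv() the RTM_NEWROUTE answer here */
    close(fd);
    return 0;
}
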
+ txqstats->collisions = txqi->tin.collisions; + } + + if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_TX_BYTES))) { + txqstats->filled |= BIT(NL80211_TXQ_STATS_TX_BYTES); + txqstats->tx_bytes = txqi->tin.tx_bytes; + } + + if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_TX_PACKETS))) { + txqstats->filled |= BIT(NL80211_TXQ_STATS_TX_PACKETS); + txqstats->tx_packets = txqi->tin.tx_packets; + } +} + +static int ieee80211_get_txq_stats(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct cfg80211_txq_stats *txqstats) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + struct ieee80211_sub_if_data *sdata; + int ret = 0; + + if (!local->ops->wake_tx_queue) + return 1; + + spin_lock_bh(&local->fq.lock); + rcu_read_lock(); + + if (wdev) { + sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + if (!sdata->vif.txq) { + ret = 1; + goto out; + } + ieee80211_fill_txq_stats(txqstats, to_txq_info(sdata->vif.txq)); + } else { + /* phy stats */ + txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS) | + BIT(NL80211_TXQ_STATS_BACKLOG_BYTES) | + BIT(NL80211_TXQ_STATS_OVERLIMIT) | + BIT(NL80211_TXQ_STATS_OVERMEMORY) | + BIT(NL80211_TXQ_STATS_COLLISIONS) | + BIT(NL80211_TXQ_STATS_MAX_FLOWS); + txqstats->backlog_packets = local->fq.backlog; + txqstats->backlog_bytes = local->fq.memory_usage; + txqstats->overlimit = local->fq.overlimit; + txqstats->overmemory = local->fq.overmemory; + txqstats->collisions = local->fq.collisions; + txqstats->max_flows = local->fq.flows_cnt; + } + +out: + rcu_read_unlock(); + spin_unlock_bh(&local->fq.lock); + + return ret; +} + const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -3798,4 +3896,5 @@ const struct cfg80211_ops mac80211_config_ops = { .del_nan_func = ieee80211_del_nan_func, .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast, .tx_control_port = ieee80211_tx_control_port, + .get_txq_stats = ieee80211_get_txq_stats, }; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 4d82fe7d627c..8f6998091d26 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -2,6 +2,7 @@ /* * Portions of this file * Copyright(c) 2016 Intel Deutschland GmbH +* Copyright (C) 2018 Intel Corporation */ #ifndef __MAC80211_DRIVER_OPS @@ -813,7 +814,8 @@ drv_allow_buffered_frames(struct ieee80211_local *local, } static inline void drv_mgd_prepare_tx(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) + struct ieee80211_sub_if_data *sdata, + u16 duration) { might_sleep(); @@ -821,9 +823,9 @@ static inline void drv_mgd_prepare_tx(struct ieee80211_local *local, return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); - trace_drv_mgd_prepare_tx(local, sdata); + trace_drv_mgd_prepare_tx(local, sdata, duration); if (local->ops->mgd_prepare_tx) - local->ops->mgd_prepare_tx(&local->hw, &sdata->vif); + local->ops->mgd_prepare_tx(&local->hw, &sdata->vif, duration); trace_drv_return_void(local); } diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c index 9cc986deda61..690c142a7a44 100644 --- a/net/mac80211/ethtool.c +++ b/net/mac80211/ethtool.c @@ -4,6 +4,7 @@ * Copied from cfg.c - originally * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2014 Intel Corporation (Author: Johannes Berg) + * Copyright (C) 2018 Intel Corporation * * This file is GPLv2 as found in COPYING. 
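
The drv_mgd_prepare_tx() change above threads a duration hint (in milliseconds) down to drivers so they can keep the radio available for the whole management exchange; the mlme.c hunks further down pass the SAE auth timeout here, and 0 still means "no estimate". A hedged sketch of a driver callback consuming the new argument follows; my_drv, MY_DRV_DEFAULT_PREP_MS and my_drv_remain_on_channel() are invented for illustration, only the callback signature comes from this patch.

#include <net/mac80211.h>

#define MY_DRV_DEFAULT_PREP_MS 100 /* invented default */

struct my_drv; /* invented driver context */
void my_drv_remain_on_channel(struct my_drv *drv, struct ieee80211_vif *vif,
                              u16 duration_ms); /* invented helper */

/* The u16 duration is the new third argument; mac80211 passes 0 when it
 * has no estimate and e.g. jiffies_to_msecs(IEEE80211_AUTH_TIMEOUT_SAE)
 * for SAE authentication (see the mlme.c hunks below). */
static void my_drv_mgd_prepare_tx(struct ieee80211_hw *hw,
                                  struct ieee80211_vif *vif,
                                  u16 duration)
{
    struct my_drv *drv = hw->priv;

    if (!duration)
        duration = MY_DRV_DEFAULT_PREP_MS;

    /* keep the radio on the operating channel long enough for the
     * whole auth/assoc exchange; SAE can take several seconds */
    my_drv_remain_on_channel(drv, vif, duration);
}

static const struct ieee80211_ops my_drv_ops = {
    /* ... the usual mandatory callbacks elided ... */
    .mgd_prepare_tx = my_drv_mgd_prepare_tx,
};
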
*/ @@ -106,8 +107,8 @@ static void ieee80211_get_stats(struct net_device *dev, if (!(sta && !WARN_ON(sta->sdata->dev != dev))) goto do_survey; - sinfo.filled = 0; - sta_set_sinfo(sta, &sinfo); + memset(&sinfo, 0, sizeof(sinfo)); + sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(sta); @@ -116,11 +117,11 @@ static void ieee80211_get_stats(struct net_device *dev, if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) - data[i] = 100000 * + data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.txrate); i++; if (sinfo.filled & BIT(NL80211_STA_INFO_RX_BITRATE)) - data[i] = 100000 * + data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.rxrate); i++; @@ -133,8 +134,8 @@ static void ieee80211_get_stats(struct net_device *dev, if (sta->sdata->dev != dev) continue; - sinfo.filled = 0; - sta_set_sinfo(sta, &sinfo); + memset(&sinfo, 0, sizeof(sinfo)); + sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(sta); } diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index c78036a0ac94..26a7ba3b698f 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -301,26 +301,27 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, ___ieee80211_stop_tx_ba_session(sta, i, reason); mutex_unlock(&sta->ampdu_mlme.mtx); - /* stopping might queue the work again - so cancel only afterwards */ - cancel_work_sync(&sta->ampdu_mlme.work); - /* * In case the tear down is part of a reconfigure due to HW restart * request, it is possible that the low level driver requested to stop * the BA session, so handle it to properly clean tid_tx data. */ - mutex_lock(&sta->ampdu_mlme.mtx); - for (i = 0; i < IEEE80211_NUM_TIDS; i++) { - struct tid_ampdu_tx *tid_tx = - rcu_dereference_protected_tid_tx(sta, i); + if(reason == AGG_STOP_DESTROY_STA) { + cancel_work_sync(&sta->ampdu_mlme.work); - if (!tid_tx) - continue; + mutex_lock(&sta->ampdu_mlme.mtx); + for (i = 0; i < IEEE80211_NUM_TIDS; i++) { + struct tid_ampdu_tx *tid_tx = + rcu_dereference_protected_tid_tx(sta, i); - if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state)) - ieee80211_stop_tx_ba_cb(sta, i, tid_tx); + if (!tid_tx) + continue; + + if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state)) + ieee80211_stop_tx_ba_cb(sta, i, tid_tx); + } + mutex_unlock(&sta->ampdu_mlme.mtx); } - mutex_unlock(&sta->ampdu_mlme.mtx); } void ieee80211_ba_session_work(struct work_struct *work) @@ -328,16 +329,11 @@ void ieee80211_ba_session_work(struct work_struct *work) struct sta_info *sta = container_of(work, struct sta_info, ampdu_mlme.work); struct tid_ampdu_tx *tid_tx; + bool blocked; int tid; - /* - * When this flag is set, new sessions should be - * blocked, and existing sessions will be torn - * down by the code that set the flag, so this - * need not run. - */ - if (test_sta_flag(sta, WLAN_STA_BLOCK_BA)) - return; + /* When this flag is set, new sessions should be blocked. 
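
The 100000ULL changes in the ethtool hunk above are an overflow fix rather than churn: cfg80211_calculate_bitrate() returns a u32 in units of 100 kbit/s, and with a plain "100000 *" the multiplication happens in 32 bits before being widened into the u64 stats slot. A worked instance:

/* 6933.3 Mbit/s VHT rate => cfg80211_calculate_bitrate() returns 69333
 * (units of 100 kbit/s). Converting to bit/s for ethtool:
 *
 *     69333 * 100000 = 6,933,300,000  >  UINT_MAX (4,294,967,295)
 *
 * so the 32-bit product wraps before the assignment widens it.
 * Forcing one operand to unsigned long long keeps all 64 bits: */
u64 bps = 100000ULL * cfg80211_calculate_bitrate(&sinfo.txrate);
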
*/ + blocked = test_sta_flag(sta, WLAN_STA_BLOCK_BA); mutex_lock(&sta->ampdu_mlme.mtx); for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) { @@ -352,7 +348,8 @@ void ieee80211_ba_session_work(struct work_struct *work) sta, tid, WLAN_BACK_RECIPIENT, WLAN_REASON_UNSPECIFIED, true); - if (test_and_clear_bit(tid, + if (!blocked && + test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_manage_offl)) ___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, IEEE80211_MAX_AMPDU_BUF, @@ -367,7 +364,7 @@ void ieee80211_ba_session_work(struct work_struct *work) spin_lock_bh(&sta->lock); tid_tx = sta->ampdu_mlme.tid_start_tx[tid]; - if (tid_tx) { + if (!blocked && tid_tx) { /* * Assign it over to the normal tid_tx array * where it "goes live". @@ -390,7 +387,8 @@ void ieee80211_ba_session_work(struct work_struct *work) if (!tid_tx) continue; - if (test_and_clear_bit(HT_AGG_STATE_START_CB, &tid_tx->state)) + if (!blocked && + test_and_clear_bit(HT_AGG_STATE_START_CB, &tid_tx->state)) ieee80211_start_tx_ba_cb(sta, tid, tid_tx); if (test_and_clear_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state)) ___ieee80211_stop_tx_ba_session(sta, tid, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 6372dbdadf53..d1978aa1c15d 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2012,6 +2012,7 @@ static inline bool ieee80211_can_run_worker(struct ieee80211_local *local) } int ieee80211_txq_setup_flows(struct ieee80211_local *local); +void ieee80211_txq_set_params(struct ieee80211_local *local); void ieee80211_txq_teardown_flows(struct ieee80211_local *local); void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, @@ -2020,6 +2021,8 @@ void ieee80211_txq_purge(struct ieee80211_local *local, struct txq_info *txqi); void ieee80211_txq_remove_vlan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); +void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats, + struct txq_info *txqi); void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *bssid, diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 9ea17afaa237..4d2e797e3f16 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -565,6 +565,9 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, if (!ops->set_key) wiphy->flags |= WIPHY_FLAG_IBSS_RSN; + if (ops->wake_tx_queue) + wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_TXQS); + wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_RRM); wiphy->bss_priv_size = sizeof(struct ieee80211_bss); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 233068756502..a59187c016e0 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -864,7 +864,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) return; } - drv_mgd_prepare_tx(local, sdata); + drv_mgd_prepare_tx(local, sdata, 0); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) @@ -2022,7 +2022,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, */ if (ieee80211_hw_check(&local->hw, DEAUTH_NEED_MGD_TX_PREP) && !ifmgd->have_beacon) - drv_mgd_prepare_tx(sdata->local, sdata); + drv_mgd_prepare_tx(sdata->local, sdata, 0); ieee80211_send_deauth_disassoc(sdata, ifmgd->bssid, stype, reason, tx, frame_buf); @@ -2560,7 +2560,7 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, if (!elems.challenge) return; 
auth_data->expected_transaction = 4; - drv_mgd_prepare_tx(sdata->local, sdata); + drv_mgd_prepare_tx(sdata->local, sdata, 0); if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; @@ -3769,6 +3769,7 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) u32 tx_flags = 0; u16 trans = 1; u16 status = 0; + u16 prepare_tx_duration = 0; sdata_assert_lock(sdata); @@ -3790,7 +3791,11 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) return -ETIMEDOUT; } - drv_mgd_prepare_tx(local, sdata); + if (auth_data->algorithm == WLAN_AUTH_SAE) + prepare_tx_duration = + jiffies_to_msecs(IEEE80211_AUTH_TIMEOUT_SAE); + + drv_mgd_prepare_tx(local, sdata, prepare_tx_duration); sdata_info(sdata, "send auth to %pM (try %d/%d)\n", auth_data->bss->bssid, auth_data->tries, @@ -4994,7 +4999,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, req->bssid, req->reason_code, ieee80211_get_reason_code_string(req->reason_code)); - drv_mgd_prepare_tx(sdata->local, sdata); + drv_mgd_prepare_tx(sdata->local, sdata, 0); ieee80211_send_deauth_disassoc(sdata, req->bssid, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, @@ -5014,7 +5019,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, req->bssid, req->reason_code, ieee80211_get_reason_code_string(req->reason_code)); - drv_mgd_prepare_tx(sdata->local, sdata); + drv_mgd_prepare_tx(sdata->local, sdata, 0); ieee80211_send_deauth_disassoc(sdata, req->bssid, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 03102aff0953..0a38cc1cbebc 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -5,6 +5,7 @@ * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2015 - 2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -97,27 +98,27 @@ static u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, */ static void remove_monitor_info(struct sk_buff *skb, unsigned int present_fcs_len, - unsigned int rtap_vendor_space) + unsigned int rtap_space) { if (present_fcs_len) __pskb_trim(skb, skb->len - present_fcs_len); - __pskb_pull(skb, rtap_vendor_space); + __pskb_pull(skb, rtap_space); } static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len, - unsigned int rtap_vendor_space) + unsigned int rtap_space) { struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_hdr *hdr; - hdr = (void *)(skb->data + rtap_vendor_space); + hdr = (void *)(skb->data + rtap_space); if (status->flag & (RX_FLAG_FAILED_FCS_CRC | RX_FLAG_FAILED_PLCP_CRC | RX_FLAG_ONLY_MONITOR)) return true; - if (unlikely(skb->len < 16 + present_fcs_len + rtap_vendor_space)) + if (unlikely(skb->len < 16 + present_fcs_len + rtap_space)) return true; if (ieee80211_is_ctl(hdr->frame_control) && @@ -199,7 +200,7 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, - int rtap_vendor_space) + int rtap_space) { struct { struct ieee80211_hdr_3addr hdr; @@ -212,14 +213,14 @@ static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata, BUILD_BUG_ON(sizeof(action) != IEEE80211_MIN_ACTION_SIZE + 1); - if (skb->len < rtap_vendor_space + sizeof(action) + + 
if (skb->len < rtap_space + sizeof(action) + VHT_MUMIMO_GROUPS_DATA_LEN) return; if (!is_valid_ether_addr(sdata->u.mntr.mu_follow_addr)) return; - skb_copy_bits(skb, rtap_vendor_space, &action, sizeof(action)); + skb_copy_bits(skb, rtap_space, &action, sizeof(action)); if (!ieee80211_is_action(action.hdr.frame_control)) return; @@ -545,7 +546,7 @@ static struct sk_buff * ieee80211_make_monitor_skb(struct ieee80211_local *local, struct sk_buff **origskb, struct ieee80211_rate *rate, - int rtap_vendor_space, bool use_origskb) + int rtap_space, bool use_origskb) { struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(*origskb); int rt_hdrlen, needed_headroom; @@ -553,7 +554,7 @@ ieee80211_make_monitor_skb(struct ieee80211_local *local, /* room for the radiotap header based on driver features */ rt_hdrlen = ieee80211_rx_radiotap_hdrlen(local, status, *origskb); - needed_headroom = rt_hdrlen - rtap_vendor_space; + needed_headroom = rt_hdrlen - rtap_space; if (use_origskb) { /* only need to expand headroom if necessary */ @@ -607,7 +608,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, struct ieee80211_sub_if_data *sdata; struct sk_buff *monskb = NULL; int present_fcs_len = 0; - unsigned int rtap_vendor_space = 0; + unsigned int rtap_space = 0; struct ieee80211_sub_if_data *monitor_sdata = rcu_dereference(local->monitor_sdata); bool only_monitor = false; @@ -615,7 +616,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) { struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data; - rtap_vendor_space = sizeof(*rtap) + rtap->len + rtap->pad; + rtap_space += sizeof(*rtap) + rtap->len + rtap->pad; } /* @@ -638,13 +639,12 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, } /* ensure hdr->frame_control and vendor radiotap data are in skb head */ - if (!pskb_may_pull(origskb, 2 + rtap_vendor_space)) { + if (!pskb_may_pull(origskb, 2 + rtap_space)) { dev_kfree_skb(origskb); return NULL; } - only_monitor = should_drop_frame(origskb, present_fcs_len, - rtap_vendor_space); + only_monitor = should_drop_frame(origskb, present_fcs_len, rtap_space); if (!local->monitors || (status->flag & RX_FLAG_SKIP_MONITOR)) { if (only_monitor) { @@ -652,12 +652,11 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, return NULL; } - remove_monitor_info(origskb, present_fcs_len, - rtap_vendor_space); + remove_monitor_info(origskb, present_fcs_len, rtap_space); return origskb; } - ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_vendor_space); + ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_space); list_for_each_entry_rcu(sdata, &local->mon_list, u.mntr.list) { bool last_monitor = list_is_last(&sdata->u.mntr.list, @@ -665,8 +664,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, if (!monskb) monskb = ieee80211_make_monitor_skb(local, &origskb, - rate, - rtap_vendor_space, + rate, rtap_space, only_monitor && last_monitor); @@ -698,7 +696,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, if (!origskb) return NULL; - remove_monitor_info(origskb, present_fcs_len, rtap_vendor_space); + remove_monitor_info(origskb, present_fcs_len, rtap_space); return origskb; } diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 655c3d8b0d80..6428f1ac37b6 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -3,6 +3,7 @@ * Copyright 2006-2007 Jiri 
Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -357,6 +358,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->last_connected = ktime_get_seconds(); ewma_signal_init(&sta->rx_stats_avg.signal); + ewma_avg_signal_init(&sta->status_stats.avg_ack_signal); for (i = 0; i < ARRAY_SIZE(sta->rx_stats_avg.chain_signal); i++) ewma_signal_init(&sta->rx_stats_avg.chain_signal[i]); @@ -1006,7 +1008,7 @@ static void __sta_info_destroy_part2(struct sta_info *sta) sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL); if (sinfo) - sta_set_sinfo(sta, sinfo); + sta_set_sinfo(sta, sinfo, true); cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); @@ -1992,7 +1994,6 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate, int band = STA_STATS_GET(LEGACY_BAND, rate); int rate_idx = STA_STATS_GET(LEGACY_IDX, rate); - rinfo->flags = 0; sband = local->hw.wiphy->bands[band]; brate = sband->bitrates[rate_idx].bitrate; if (rinfo->bw == RATE_INFO_BW_5) @@ -2051,6 +2052,18 @@ static void sta_set_tidstats(struct sta_info *sta, tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED); tidstats->tx_msdu_failed = sta->status_stats.msdu_failed[tid]; } + + if (local->ops->wake_tx_queue && tid < IEEE80211_NUM_TIDS) { + spin_lock_bh(&local->fq.lock); + rcu_read_lock(); + + tidstats->filled |= BIT(NL80211_TID_STATS_TXQ_STATS); + ieee80211_fill_txq_stats(&tidstats->txq_stats, + to_txq_info(sta->sta.txq[tid])); + + rcu_read_unlock(); + spin_unlock_bh(&local->fq.lock); + } } static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) @@ -2066,7 +2079,8 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) return value; } -void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) +void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, + bool tidstats) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; @@ -2220,11 +2234,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE); } - sinfo->filled |= BIT(NL80211_STA_INFO_TID_STATS); - for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) { - struct cfg80211_tid_stats *tidstats = &sinfo->pertid[i]; + if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) { + for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) { + struct cfg80211_tid_stats *tidstats = &sinfo->pertid[i]; - sta_set_tidstats(sta, tidstats, i); + sta_set_tidstats(sta, tidstats, i); + } } if (ieee80211_vif_is_mesh(&sdata->vif)) { @@ -2294,6 +2309,15 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->ack_signal = sta->status_stats.last_ack_signal; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); } + + if (ieee80211_hw_check(&sta->local->hw, REPORTS_TX_ACK_STATUS) && + !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG))) { + sinfo->avg_ack_signal = + -(s8)ewma_avg_signal_read( + &sta->status_stats.avg_ack_signal); + sinfo->filled |= + BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG); + } } u32 sta_get_expected_throughput(struct sta_info *sta) diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index f64eb86ca64b..81b35f623792 100644 --- a/net/mac80211/sta_info.h +++ 
b/net/mac80211/sta_info.h @@ -119,6 +119,7 @@ enum ieee80211_sta_info_flags { #define HT_AGG_STATE_START_CB 6 #define HT_AGG_STATE_STOP_CB 7 +DECLARE_EWMA(avg_signal, 10, 8) enum ieee80211_agg_stop_reason { AGG_STOP_DECLINED, AGG_STOP_LOCAL_REQUEST, @@ -550,6 +551,7 @@ struct sta_info { unsigned long last_ack; s8 last_ack_signal; bool ack_signal_filled; + struct ewma_avg_signal avg_ack_signal; } status_stats; /* Updated from TX path only, no locking requirements */ @@ -742,7 +744,8 @@ static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata) void sta_set_rate_info_tx(struct sta_info *sta, const struct ieee80211_tx_rate *rate, struct rate_info *rinfo); -void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo); +void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, + bool tidstats); u32 sta_get_expected_throughput(struct sta_info *sta); diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 743e89c5926c..9a6d7208bf4f 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -195,6 +195,8 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) sta->status_stats.last_ack_signal = (s8)txinfo->status.ack_signal; sta->status_stats.ack_signal_filled = true; + ewma_avg_signal_add(&sta->status_stats.avg_ack_signal, + -txinfo->status.ack_signal); } } diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 591ad02e1fa4..80a7edf8d314 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -2,6 +2,7 @@ /* * Portions of this file * Copyright(c) 2016 Intel Deutschland GmbH +* Copyright (C) 2018 Intel Corporation */ #if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) @@ -1413,11 +1414,29 @@ DEFINE_EVENT(release_evt, drv_allow_buffered_frames, TP_ARGS(local, sta, tids, num_frames, reason, more_data) ); -DEFINE_EVENT(local_sdata_evt, drv_mgd_prepare_tx, +TRACE_EVENT(drv_mgd_prepare_tx, TP_PROTO(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata), + struct ieee80211_sub_if_data *sdata, + u16 duration), - TP_ARGS(local, sdata) + TP_ARGS(local, sdata, duration), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(u32, duration) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->duration = duration; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT " duration: %u", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->duration + ) ); DEFINE_EVENT(local_sdata_evt, drv_mgd_protect_tdls_discover, diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 05a265cd573d..44b5dfe8727d 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1460,6 +1460,24 @@ void ieee80211_txq_purge(struct ieee80211_local *local, ieee80211_purge_tx_queue(&local->hw, &txqi->frags); } +void ieee80211_txq_set_params(struct ieee80211_local *local) +{ + if (local->hw.wiphy->txq_limit) + local->fq.limit = local->hw.wiphy->txq_limit; + else + local->hw.wiphy->txq_limit = local->fq.limit; + + if (local->hw.wiphy->txq_memory_limit) + local->fq.memory_limit = local->hw.wiphy->txq_memory_limit; + else + local->hw.wiphy->txq_memory_limit = local->fq.memory_limit; + + if (local->hw.wiphy->txq_quantum) + local->fq.quantum = local->hw.wiphy->txq_quantum; + else + local->hw.wiphy->txq_quantum = local->fq.quantum; +} + int ieee80211_txq_setup_flows(struct ieee80211_local *local) { struct fq *fq = &local->fq; @@ -1509,6 +1527,8 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) for (i = 0; i < fq->flows_cnt; i++) codel_vars_init(&local->cvars[i]); + ieee80211_txq_set_params(local); + return 0; } @@ -4085,6 
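
The avg_ack_signal plumbing above builds on DECLARE_EWMA() from <linux/average.h>, which generates the struct and its init/add/read helpers at compile time. Since the average is kept as an unsigned value while ACK signal strength is negative dBm, the TX status path (status.c above) negates on insertion and sta_set_sinfo() negates again on readout. A condensed sketch of that round trip, reusing the exact macro parameters from the patch:

#include <linux/average.h>

/* Same parameters as the sta_info.h hunk: 10 bits of fractional
 * precision, weight reciprocal 8. This generates struct
 * ewma_avg_signal and ewma_avg_signal_{init,add,read}(). */
DECLARE_EWMA(avg_signal, 10, 8)

static struct ewma_avg_signal avg;

static void example_init(void)
{
    ewma_avg_signal_init(&avg); /* once per station, cf. sta_info_alloc() */
}

static void example_ack(s8 ack_signal_dbm) /* e.g. -45 from TX status */
{
    /* the EWMA stores unsigned values, so feed in the magnitude */
    ewma_avg_signal_add(&avg, -ack_signal_dbm);
}

static s8 example_report(void)
{
    /* negate back to dBm on readout, as sta_set_sinfo() does */
    return -(s8)ewma_avg_signal_read(&avg);
}
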
+4105,31 @@ unlock: } EXPORT_SYMBOL(ieee80211_csa_update_counter); +void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct beacon_data *beacon = NULL; + + rcu_read_lock(); + + if (sdata->vif.type == NL80211_IFTYPE_AP) + beacon = rcu_dereference(sdata->u.ap.beacon); + else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + beacon = rcu_dereference(sdata->u.ibss.presp); + else if (ieee80211_vif_is_mesh(&sdata->vif)) + beacon = rcu_dereference(sdata->u.mesh.beacon); + + if (!beacon) + goto unlock; + + if (counter < beacon->csa_current_counter) + beacon->csa_current_counter = counter; + +unlock: + rcu_read_unlock(); +} +EXPORT_SYMBOL(ieee80211_csa_set_counter); + bool ieee80211_csa_is_complete(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 11f9cfc016d9..2d82c88efd0b 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2793,12 +2793,13 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, memset(&ri, 0, sizeof(ri)); + ri.bw = status->bw; + /* Fill cfg80211 rate info */ switch (status->encoding) { case RX_ENC_HT: ri.mcs = status->rate_idx; ri.flags |= RATE_INFO_FLAGS_MCS; - ri.bw = status->bw; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; break; @@ -2806,7 +2807,6 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, ri.flags |= RATE_INFO_FLAGS_VHT_MCS; ri.mcs = status->rate_idx; ri.nss = status->nss; - ri.bw = status->bw; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; break; @@ -2818,8 +2818,6 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, int shift = 0; int bitrate; - ri.bw = status->bw; - switch (status->bw) { case RATE_INFO_BW_10: shift = 1; diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index ce9497966ebe..a6b7c7d5c829 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -347,7 +347,7 @@ static int ncsi_rsp_handler_svf(struct ncsi_request *nr) cmd = (struct ncsi_cmd_svf_pkt *)skb_network_header(nr->cmd); ncf = &nc->vlan_filter; - if (cmd->index > ncf->n_vids) + if (cmd->index == 0 || cmd->index > ncf->n_vids) return -ERANGE; /* Add or remove the VLAN filter. 
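
ieee80211_csa_set_counter() above lets a driver resynchronize mac80211's CSA countdown with what its firmware actually transmitted, complementing the existing ieee80211_csa_update_counter()/ieee80211_csa_is_complete() pair; note it only ever lowers the counter. A hedged usage sketch from a hypothetical beacon-completion path (my_drv_beacon_done() and the firmware-reported count are invented plumbing):

#include <net/mac80211.h>

/* Hypothetical beacon-completion path: firmware reports which countdown
 * value the beacon it just sent actually carried, and we resync
 * mac80211 before checking for completion. */
static void my_drv_beacon_done(struct ieee80211_vif *vif, u8 fw_count)
{
    /* no-op unless fw_count is lower than mac80211's counter */
    ieee80211_csa_set_counter(vif, fw_count);

    if (ieee80211_csa_is_complete(vif))
        ieee80211_csa_finish(vif);
}
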
Remember HW indexes from 1 */ @@ -445,7 +445,8 @@ static int ncsi_rsp_handler_sma(struct ncsi_request *nr) ncf = &nc->mac_filter; bitmap = &ncf->bitmap; - if (cmd->index > ncf->n_uc + ncf->n_mc + ncf->n_mixed) + if (cmd->index == 0 || + cmd->index > ncf->n_uc + ncf->n_mc + ncf->n_mixed) return -ERANGE; index = (cmd->index - 1) * ETH_ALEN; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index e57c9d479503..a5b60e6a983e 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -445,7 +445,7 @@ config NETFILTER_SYNPROXY endif # NF_CONNTRACK config NF_OSF - tristate 'Passive OS fingerprint infrastructure' + tristate config NF_TABLES select NETFILTER_NETLINK diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 0f6b8172fb9a..168af54db975 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -138,11 +138,6 @@ nf_hook_entries_grow(const struct nf_hook_entries *old, continue; } - if (reg->nat_hook && orig_ops[i]->nat_hook) { - kvfree(new); - return ERR_PTR(-EBUSY); - } - if (inserted || reg->priority > orig_ops[i]->priority) { new_ops[nhooks] = (void *)orig_ops[i]; new->hooks[nhooks] = old->hooks[i]; @@ -186,9 +181,31 @@ static void hooks_validate(const struct nf_hook_entries *hooks) #endif } +int nf_hook_entries_insert_raw(struct nf_hook_entries __rcu **pp, + const struct nf_hook_ops *reg) +{ + struct nf_hook_entries *new_hooks; + struct nf_hook_entries *p; + + p = rcu_dereference_raw(*pp); + new_hooks = nf_hook_entries_grow(p, reg); + if (IS_ERR(new_hooks)) + return PTR_ERR(new_hooks); + + hooks_validate(new_hooks); + + rcu_assign_pointer(*pp, new_hooks); + + BUG_ON(p == new_hooks); + nf_hook_entries_free(p); + return 0; +} +EXPORT_SYMBOL_GPL(nf_hook_entries_insert_raw); + /* * __nf_hook_entries_try_shrink - try to shrink hook array * + * @old -- current hook blob at @pp * @pp -- location of hook blob * * Hook unregistration must always succeed, so to-be-removed hooks @@ -201,14 +218,14 @@ static void hooks_validate(const struct nf_hook_entries *hooks) * * Returns address to free, or NULL. */ -static void *__nf_hook_entries_try_shrink(struct nf_hook_entries __rcu **pp) +static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old, + struct nf_hook_entries __rcu **pp) { - struct nf_hook_entries *old, *new = NULL; unsigned int i, j, skip = 0, hook_entries; + struct nf_hook_entries *new = NULL; struct nf_hook_ops **orig_ops; struct nf_hook_ops **new_ops; - old = nf_entry_dereference(*pp); if (WARN_ON_ONCE(!old)) return NULL; @@ -347,11 +364,10 @@ static int __nf_register_net_hook(struct net *net, int pf, * This cannot fail, hook unregistration must always succeed. * Therefore replace the to-be-removed hook with a dummy hook. 
*/ -static void nf_remove_net_hook(struct nf_hook_entries *old, - const struct nf_hook_ops *unreg, int pf) +static bool nf_remove_net_hook(struct nf_hook_entries *old, + const struct nf_hook_ops *unreg) { struct nf_hook_ops **orig_ops; - bool found = false; unsigned int i; orig_ops = nf_hook_entries_get_hook_ops(old); @@ -360,21 +376,10 @@ static void nf_remove_net_hook(struct nf_hook_entries *old, continue; WRITE_ONCE(old->hooks[i].hook, accept_all); WRITE_ONCE(orig_ops[i], &dummy_ops); - found = true; - break; + return true; } - if (found) { -#ifdef CONFIG_NETFILTER_INGRESS - if (pf == NFPROTO_NETDEV && unreg->hooknum == NF_NETDEV_INGRESS) - net_dec_ingress_queue(); -#endif -#ifdef HAVE_JUMP_LABEL - static_key_slow_dec(&nf_hooks_needed[pf][unreg->hooknum]); -#endif - } else { - WARN_ONCE(1, "hook not found, pf %d num %d", pf, unreg->hooknum); - } + return false; } static void __nf_unregister_net_hook(struct net *net, int pf, @@ -395,9 +400,19 @@ static void __nf_unregister_net_hook(struct net *net, int pf, return; } - nf_remove_net_hook(p, reg, pf); + if (nf_remove_net_hook(p, reg)) { +#ifdef CONFIG_NETFILTER_INGRESS + if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) + net_dec_ingress_queue(); +#endif +#ifdef HAVE_JUMP_LABEL + static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]); +#endif + } else { + WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum); + } - p = __nf_hook_entries_try_shrink(pp); + p = __nf_hook_entries_try_shrink(p, pp); mutex_unlock(&nf_hook_mutex); if (!p) return; @@ -417,6 +432,19 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) } EXPORT_SYMBOL(nf_unregister_net_hook); +void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp, + const struct nf_hook_ops *reg) +{ + struct nf_hook_entries *p; + + p = rcu_dereference_raw(*pp); + if (nf_remove_net_hook(p, reg)) { + p = __nf_hook_entries_try_shrink(p, pp); + nf_hook_entries_free(p); + } +} +EXPORT_SYMBOL_GPL(nf_hook_entries_delete_raw); + int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { int err; @@ -535,6 +563,9 @@ EXPORT_SYMBOL(skb_make_writable); struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nfnl_ct_hook); +struct nf_ct_hook __rcu *nf_ct_hook __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_hook); + #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* This does not belong here, but locally generated errors need it if connection tracking in use: without this, connection may not be in hash table, and hence @@ -543,6 +574,9 @@ void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu __read_mostly; EXPORT_SYMBOL(ip_ct_attach); +struct nf_nat_hook __rcu *nf_nat_hook __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_hook); + void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb) { void (*attach)(struct sk_buff *, const struct sk_buff *); @@ -557,17 +591,14 @@ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb) } EXPORT_SYMBOL(nf_ct_attach); -void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly; -EXPORT_SYMBOL(nf_ct_destroy); - void nf_conntrack_destroy(struct nf_conntrack *nfct) { - void (*destroy)(struct nf_conntrack *); + struct nf_ct_hook *ct_hook; rcu_read_lock(); - destroy = rcu_dereference(nf_ct_destroy); - BUG_ON(destroy == NULL); - destroy(nfct); + ct_hook = rcu_dereference(nf_ct_hook); + BUG_ON(ct_hook == NULL); + ct_hook->destroy(nfct); rcu_read_unlock(); } EXPORT_SYMBOL(nf_conntrack_destroy); @@ -580,12 +611,8 @@ const struct nf_conntrack_zone nf_ct_zone_dflt = { 
EXPORT_SYMBOL_GPL(nf_ct_zone_dflt); #endif /* CONFIG_NF_CONNTRACK */ -#ifdef CONFIG_NF_NAT_NEEDED -void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *); -EXPORT_SYMBOL(nf_nat_decode_session_hook); -#endif - -static void __net_init __netfilter_net_init(struct nf_hook_entries **e, int max) +static void __net_init +__netfilter_net_init(struct nf_hook_entries __rcu **e, int max) { int h; diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 370abbf6f421..75de46576f51 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -232,7 +232,10 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) { unsigned int hash; - bool ret; + bool ret = false; + + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return refcount_dec_if_one(&cp->refcnt); hash = ip_vs_conn_hashkey_conn(cp); @@ -240,15 +243,13 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { - ret = false; /* Decrease refcnt and unlink conn only if we are last user */ if (refcount_dec_if_one(&cp->refcnt)) { hlist_del_rcu(&cp->c_list); cp->flags &= ~IP_VS_CONN_F_HASHED; ret = true; } - } else - ret = refcount_read(&cp->refcnt) ? false : true; + } spin_unlock(&cp->lock); ct_write_unlock_bh(hash); @@ -454,12 +455,6 @@ ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, } EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); -static void __ip_vs_conn_put_notimer(struct ip_vs_conn *cp) -{ - __ip_vs_conn_put(cp); - ip_vs_conn_expire(&cp->timer); -} - /* * Put back the conn and restart its timer with its timeout */ @@ -478,7 +473,7 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) (refcount_read(&cp->refcnt) == 1) && !timer_pending(&cp->timer)) /* expire connection immediately */ - __ip_vs_conn_put_notimer(cp); + ip_vs_conn_expire(&cp->timer); else __ip_vs_conn_put_timer(cp); } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 5f6f73cf2174..0679dd101e72 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -119,6 +119,8 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) struct ip_vs_cpu_stats *s; struct ip_vs_service *svc; + local_bh_disable(); + s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); s->cnt.inpkts++; @@ -137,6 +139,8 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s->cnt.inpkts++; s->cnt.inbytes += skb->len; u64_stats_update_end(&s->syncp); + + local_bh_enable(); } } @@ -151,6 +155,8 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) struct ip_vs_cpu_stats *s; struct ip_vs_service *svc; + local_bh_disable(); + s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); s->cnt.outpkts++; @@ -169,6 +175,8 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s->cnt.outpkts++; s->cnt.outbytes += skb->len; u64_stats_update_end(&s->syncp); + + local_bh_enable(); } } @@ -179,6 +187,8 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) struct netns_ipvs *ipvs = svc->ipvs; struct ip_vs_cpu_stats *s; + local_bh_disable(); + s = this_cpu_ptr(cp->dest->stats.cpustats); u64_stats_update_begin(&s->syncp); s->cnt.conns++; @@ -193,6 +203,8 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) u64_stats_update_begin(&s->syncp); s->cnt.conns++; u64_stats_update_end(&s->syncp); + + local_bh_enable(); } diff --git a/net/netfilter/nf_conntrack_core.c 
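
The local_bh_disable()/local_bh_enable() pairs added to the IPVS stats helpers above are a correctness fix: the same per-cpu counters are updated from both process context and softirq, and on 32-bit SMP a BH interrupting an open u64_stats sequence on the same CPU could leave readers spinning or seeing torn 64-bit values. The pattern reduced to a skeleton (my_pcpu_stats and my_account() are invented stand-ins for ip_vs_cpu_stats and the stats helpers):

#include <linux/bottom_half.h>
#include <linux/percpu.h>
#include <linux/u64_stats_sync.h>

struct my_pcpu_stats { /* invented stand-in for ip_vs_cpu_stats */
    u64 pkts;
    u64 bytes;
    struct u64_stats_sync syncp;
};

static DEFINE_PER_CPU(struct my_pcpu_stats, my_stats);

static void my_account(unsigned int len)
{
    struct my_pcpu_stats *s;

    local_bh_disable(); /* no softirq may reopen this CPU's seqcount */
    s = this_cpu_ptr(&my_stats);
    u64_stats_update_begin(&s->syncp);
    s->pkts++;
    s->bytes += len;
    u64_stats_update_end(&s->syncp);
    local_bh_enable();
}
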
b/net/netfilter/nf_conntrack_core.c index 605441727008..3465da2a98bd 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -58,11 +58,6 @@ #include "nf_internals.h" -int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, - enum nf_nat_manip_type manip, - const struct nlattr *attr) __read_mostly; -EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook); - __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; EXPORT_SYMBOL_GPL(nf_conntrack_locks); @@ -1612,6 +1607,82 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) nf_conntrack_get(skb_nfct(nskb)); } +static int nf_conntrack_update(struct net *net, struct sk_buff *skb) +{ + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + enum ip_conntrack_info ctinfo; + struct nf_nat_hook *nat_hook; + unsigned int dataoff, status; + struct nf_conn *ct; + u16 l3num; + u8 l4num; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || nf_ct_is_confirmed(ct)) + return 0; + + l3num = nf_ct_l3num(ct); + l3proto = nf_ct_l3proto_find_get(l3num); + + if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, + &l4num) <= 0) + return -1; + + l4proto = nf_ct_l4proto_find_get(l3num, l4num); + + if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, + l4num, net, &tuple, l3proto, l4proto)) + return -1; + + if (ct->status & IPS_SRC_NAT) { + memcpy(tuple.src.u3.all, + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all, + sizeof(tuple.src.u3.all)); + tuple.src.u.all = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all; + } + + if (ct->status & IPS_DST_NAT) { + memcpy(tuple.dst.u3.all, + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all, + sizeof(tuple.dst.u3.all)); + tuple.dst.u.all = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all; + } + + h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); + if (!h) + return 0; + + /* Store status bits of the conntrack that is clashing to re-do NAT + * mangling according to what it has been done already to this packet. + */ + status = ct->status; + + nf_ct_put(ct); + ct = nf_ct_tuplehash_to_ctrack(h); + nf_ct_set(skb, ct, ctinfo); + + nat_hook = rcu_dereference(nf_nat_hook); + if (!nat_hook) + return 0; + + if (status & IPS_SRC_NAT && + nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC, + IP_CT_DIR_ORIGINAL) == NF_DROP) + return -1; + + if (status & IPS_DST_NAT && + nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST, + IP_CT_DIR_ORIGINAL) == NF_DROP) + return -1; + + return 0; +} + /* Bring out ya dead! 
*/ static struct nf_conn * get_next_corpse(int (*iter)(struct nf_conn *i, void *data), @@ -1813,8 +1884,7 @@ void nf_conntrack_cleanup_start(void) void nf_conntrack_cleanup_end(void) { - RCU_INIT_POINTER(nf_ct_destroy, NULL); - + RCU_INIT_POINTER(nf_ct_hook, NULL); cancel_delayed_work_sync(&conntrack_gc_work.dwork); nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); @@ -2131,11 +2201,16 @@ err_cachep: return ret; } +static struct nf_ct_hook nf_conntrack_hook = { + .update = nf_conntrack_update, + .destroy = destroy_conntrack, +}; + void nf_conntrack_init_end(void) { /* For use by REJECT target */ RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach); - RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack); + RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook); } /* diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index d807b8770be3..39327a42879f 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1431,11 +1431,11 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { - typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup; + struct nf_nat_hook *nat_hook; int err; - parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook); - if (!parse_nat_setup) { + nat_hook = rcu_dereference(nf_nat_hook); + if (!nat_hook) { #ifdef CONFIG_MODULES rcu_read_unlock(); nfnl_unlock(NFNL_SUBSYS_CTNETLINK); @@ -1446,13 +1446,13 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, } nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); - if (nfnetlink_parse_nat_setup_hook) + if (nat_hook->parse_nat_setup) return -EAGAIN; #endif return -EOPNOTSUPP; } - err = parse_nat_setup(ct, manip, attr); + err = nat_hook->parse_nat_setup(ct, manip, attr); if (err == -EAGAIN) { #ifdef CONFIG_MODULES rcu_read_unlock(); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index e97cdc1cf98c..8e67910185a0 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -981,6 +981,17 @@ static int tcp_packet(struct nf_conn *ct, return NF_ACCEPT; /* Don't change state */ } break; + case TCP_CONNTRACK_SYN_SENT2: + /* tcp_conntracks table is not smart enough to handle + * simultaneous open. 
+ */ + ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN; + break; + case TCP_CONNTRACK_SYN_RECV: + if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET && + ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN) + new_state = TCP_CONNTRACK_ESTABLISHED; + break; case TCP_CONNTRACK_CLOSE: if (index == TCP_RST_SET && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 18f6d7ae995b..e15779fd58e3 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -15,4 +15,9 @@ void nf_queue_nf_hook_drop(struct net *net); /* nf_log.c */ int __init netfilter_log_init(void); +/* core.c */ +void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp, + const struct nf_hook_ops *reg); +int nf_hook_entries_insert_raw(struct nf_hook_entries __rcu **pp, + const struct nf_hook_ops *reg); #endif diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 37b3c9913b08..821f8d835f7a 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -32,6 +32,8 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <linux/netfilter/nf_nat.h> +#include "nf_internals.h" + static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); @@ -39,11 +41,27 @@ static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] __read_mostly; static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly; +static unsigned int nat_net_id __read_mostly; static struct hlist_head *nf_nat_bysource __read_mostly; static unsigned int nf_nat_htable_size __read_mostly; static unsigned int nf_nat_hash_rnd __read_mostly; +struct nf_nat_lookup_hook_priv { + struct nf_hook_entries __rcu *entries; + + struct rcu_head rcu_head; +}; + +struct nf_nat_hooks_net { + struct nf_hook_ops *nat_hook_ops; + unsigned int users; +}; + +struct nat_net { + struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO]; +}; + inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) { @@ -475,17 +493,36 @@ nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) } EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); +static unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct, + enum nf_nat_manip_type mtype, + enum ip_conntrack_dir dir) +{ + const struct nf_nat_l3proto *l3proto; + const struct nf_nat_l4proto *l4proto; + struct nf_conntrack_tuple target; + + /* We are aiming to look like inverse of other direction. */ + nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + + l3proto = __nf_nat_l3proto_find(target.src.l3num); + l4proto = __nf_nat_l4proto_find(target.src.l3num, + target.dst.protonum); + if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype)) + return NF_DROP; + + return NF_ACCEPT; +} + /* Do packet manipulations according to nf_nat_setup_info. */ unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, struct sk_buff *skb) { - const struct nf_nat_l3proto *l3proto; - const struct nf_nat_l4proto *l4proto; + enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + unsigned int verdict = NF_ACCEPT; unsigned long statusbit; - enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); if (mtype == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; @@ -497,21 +534,87 @@ unsigned int nf_nat_packet(struct nf_conn *ct, statusbit ^= IPS_NAT_MASK; /* Non-atomic: these bits don't change. 
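
The tcp_packet() hunk above finally lets conntrack follow a simultaneous open, which the static tcp_conntracks table cannot express on its own. The transition sequence, sketched as comments from the new cases (my reading of the code; on-the-wire ordering can interleave differently):

/*
 *   A -> B : SYN        original dir, ct -> SYN_SENT
 *   B -> A : SYN        reply dir,    ct -> SYN_SENT2
 *                       new: last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN
 *   A -> B : SYN/ACK    original dir, ct -> SYN_RECV
 *   B -> A : ACK        reply dir; with the flag set, the new
 *                       SYN_RECV case promotes the flow to
 *                       ESTABLISHED instead of leaving it stuck.
 */
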
*/ - if (ct->status & statusbit) { - struct nf_conntrack_tuple target; + if (ct->status & statusbit) + verdict = nf_nat_manip_pkt(skb, ct, mtype, dir); + + return verdict; +} +EXPORT_SYMBOL_GPL(nf_nat_packet); + +unsigned int +nf_nat_inet_fn(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conn_nat *nat; + /* maniptype == SRC for postrouting. */ + enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); + + ct = nf_ct_get(skb, &ctinfo); + /* Can't track? It's not due to stress, or conntrack would + * have dropped it. Hence it's the user's responsibilty to + * packet filter it out, or implement conntrack/NAT for that + * protocol. 8) --RR + */ + if (!ct) + return NF_ACCEPT; - /* We are aiming to look like inverse of other direction. */ - nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + nat = nfct_nat(ct); - l3proto = __nf_nat_l3proto_find(target.src.l3num); - l4proto = __nf_nat_l4proto_find(target.src.l3num, - target.dst.protonum); - if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype)) - return NF_DROP; + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + /* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */ + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + struct nf_nat_lookup_hook_priv *lpriv = priv; + struct nf_hook_entries *e = rcu_dereference(lpriv->entries); + unsigned int ret; + int i; + + if (!e) + goto null_bind; + + for (i = 0; i < e->num_hook_entries; i++) { + ret = e->hooks[i].hook(e->hooks[i].priv, skb, + state); + if (ret != NF_ACCEPT) + return ret; + if (nf_nat_initialized(ct, maniptype)) + goto do_nat; + } +null_bind: + ret = nf_nat_alloc_null_binding(ct, state->hook); + if (ret != NF_ACCEPT) + return ret; + } else { + pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n", + maniptype == NF_NAT_MANIP_SRC ? 
"SRC" : "DST", + ct, ct->status); + if (nf_nat_oif_changed(state->hook, ctinfo, nat, + state->out)) + goto oif_changed; + } + break; + default: + /* ESTABLISHED */ + WARN_ON(ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY); + if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) + goto oif_changed; } - return NF_ACCEPT; +do_nat: + return nf_nat_packet(ct, ctinfo, state->hook, skb); + +oif_changed: + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_DROP; } -EXPORT_SYMBOL_GPL(nf_nat_packet); +EXPORT_SYMBOL_GPL(nf_nat_inet_fn); struct nf_nat_proto_clean { u8 l3proto; @@ -801,6 +904,146 @@ static struct nf_ct_helper_expectfn follow_master_nat = { .expectfn = nf_nat_follow_master, }; +int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops, + const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count) +{ + struct nat_net *nat_net = net_generic(net, nat_net_id); + struct nf_nat_hooks_net *nat_proto_net; + struct nf_nat_lookup_hook_priv *priv; + unsigned int hooknum = ops->hooknum; + struct nf_hook_ops *nat_ops; + int i, ret; + + if (WARN_ON_ONCE(ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net))) + return -EINVAL; + + nat_proto_net = &nat_net->nat_proto_net[ops->pf]; + + for (i = 0; i < ops_count; i++) { + if (WARN_ON(orig_nat_ops[i].pf != ops->pf)) + return -EINVAL; + if (orig_nat_ops[i].hooknum == hooknum) { + hooknum = i; + break; + } + } + + if (WARN_ON_ONCE(i == ops_count)) + return -EINVAL; + + mutex_lock(&nf_nat_proto_mutex); + if (!nat_proto_net->nat_hook_ops) { + WARN_ON(nat_proto_net->users != 0); + + nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL); + if (!nat_ops) { + mutex_unlock(&nf_nat_proto_mutex); + return -ENOMEM; + } + + for (i = 0; i < ops_count; i++) { + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + nat_ops[i].priv = priv; + continue; + } + mutex_unlock(&nf_nat_proto_mutex); + while (i) + kfree(nat_ops[--i].priv); + kfree(nat_ops); + return -ENOMEM; + } + + ret = nf_register_net_hooks(net, nat_ops, ops_count); + if (ret < 0) { + mutex_unlock(&nf_nat_proto_mutex); + for (i = 0; i < ops_count; i++) + kfree(nat_ops[i].priv); + kfree(nat_ops); + return ret; + } + + nat_proto_net->nat_hook_ops = nat_ops; + } + + nat_ops = nat_proto_net->nat_hook_ops; + priv = nat_ops[hooknum].priv; + if (WARN_ON_ONCE(!priv)) { + mutex_unlock(&nf_nat_proto_mutex); + return -EOPNOTSUPP; + } + + ret = nf_hook_entries_insert_raw(&priv->entries, ops); + if (ret == 0) + nat_proto_net->users++; + + mutex_unlock(&nf_nat_proto_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_register_fn); + +void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops, + unsigned int ops_count) +{ + struct nat_net *nat_net = net_generic(net, nat_net_id); + struct nf_nat_hooks_net *nat_proto_net; + struct nf_nat_lookup_hook_priv *priv; + struct nf_hook_ops *nat_ops; + int hooknum = ops->hooknum; + int i; + + if (ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net)) + return; + + nat_proto_net = &nat_net->nat_proto_net[ops->pf]; + + mutex_lock(&nf_nat_proto_mutex); + if (WARN_ON(nat_proto_net->users == 0)) + goto unlock; + + nat_proto_net->users--; + + nat_ops = nat_proto_net->nat_hook_ops; + for (i = 0; i < ops_count; i++) { + if (nat_ops[i].hooknum == hooknum) { + hooknum = i; + break; + } + } + if (WARN_ON_ONCE(i == ops_count)) + goto unlock; + priv = nat_ops[hooknum].priv; + nf_hook_entries_delete_raw(&priv->entries, ops); + + if (nat_proto_net->users == 0) { + nf_unregister_net_hooks(net, nat_ops, ops_count); + + for (i = 0; i < 
ops_count; i++) { + priv = nat_ops[i].priv; + kfree_rcu(priv, rcu_head); + } + + nat_proto_net->nat_hook_ops = NULL; + kfree(nat_ops); + } +unlock: + mutex_unlock(&nf_nat_proto_mutex); +} +EXPORT_SYMBOL_GPL(nf_nat_unregister_fn); + +static struct pernet_operations nat_net_ops = { + .id = &nat_net_id, + .size = sizeof(struct nat_net), +}; + +struct nf_nat_hook nat_hook = { + .parse_nat_setup = nfnetlink_parse_nat_setup, +#ifdef CONFIG_XFRM + .decode_session = __nf_nat_decode_session, +#endif + .manip_pkt = nf_nat_manip_pkt, +}; + static int __init nf_nat_init(void) { int ret, i; @@ -824,15 +1067,17 @@ static int __init nf_nat_init(void) for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_nat_locks[i]); + ret = register_pernet_subsys(&nat_net_ops); + if (ret < 0) { + nf_ct_extend_unregister(&nat_extend); + return ret; + } + nf_ct_helper_expectfn_register(&follow_master_nat); - BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); - RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, - nfnetlink_parse_nat_setup); -#ifdef CONFIG_XFRM - BUG_ON(nf_nat_decode_session_hook != NULL); - RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session); -#endif + WARN_ON(nf_nat_hook != NULL); + RCU_INIT_POINTER(nf_nat_hook, &nat_hook); + return 0; } @@ -845,16 +1090,15 @@ static void __exit nf_nat_cleanup(void) nf_ct_extend_unregister(&nat_extend); nf_ct_helper_expectfn_unregister(&follow_master_nat); - RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); -#ifdef CONFIG_XFRM - RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL); -#endif + RCU_INIT_POINTER(nf_nat_hook, NULL); + synchronize_rcu(); for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); synchronize_net(); nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); + unregister_pernet_subsys(&nat_net_ops); } MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 18bd584fadda..87b2a77add65 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -74,88 +74,43 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } -/* removal requests are queued in the commit_list, but not acted upon - * until after all new rules are in place. - * - * Therefore, nf_register_net_hook(net, &nat_hook) runs before pending - * nf_unregister_net_hook(). - * - * nf_register_net_hook thus fails if a nat hook is already in place - * even if the conflicting hook is about to be removed. - * - * If collision is detected, search commit_log for DELCHAIN matching - * the new nat hooknum; if we find one collision is temporary: - * - * Either transaction is aborted (new/colliding hook is removed), or - * transaction is committed (old hook is removed). - */ -static bool nf_tables_allow_nat_conflict(const struct net *net, - const struct nf_hook_ops *ops) -{ - const struct nft_trans *trans; - bool ret = false; - - if (!ops->nat_hook) - return false; - - list_for_each_entry(trans, &net->nft.commit_list, list) { - const struct nf_hook_ops *pending_ops; - const struct nft_chain *pending; - - if (trans->msg_type != NFT_MSG_NEWCHAIN && - trans->msg_type != NFT_MSG_DELCHAIN) - continue; - - pending = trans->ctx.chain; - if (!nft_is_base_chain(pending)) - continue; - - pending_ops = &nft_base_chain(pending)->ops; - if (pending_ops->nat_hook && - pending_ops->pf == ops->pf && - pending_ops->hooknum == ops->hooknum) { - /* other hook registration already pending? 
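
nf_nat_register_fn()/nf_nat_unregister_fn() above are the core of this refactor: the first user per family gets the kmemdup'd backend nf_hook_ops (whose .hook entries run nf_nat_inet_fn()) registered with netfilter, and each caller's own ops are threaded into the per-hook priv->entries list that nf_nat_inet_fn() walks before falling back to a null binding. A hedged consumer sketch, modelled on the nft_chain_nat_ipv6 conversion at the top of this section (my_nat_lookup and the priority choice are illustrative):

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
/* nf_nat_l3proto_ipv6_{register,unregister}_fn() come from the NAT
 * l3proto header; their definitions open this section. */

static unsigned int my_nat_lookup(void *priv, struct sk_buff *skb,
                                  const struct nf_hook_state *state)
{
    /* decide on a binding here (e.g. via nf_nat_setup_info());
     * plain NF_ACCEPT lets nf_nat_inet_fn() fall through to its
     * null-binding path */
    return NF_ACCEPT;
}

static const struct nf_hook_ops my_ops = {
    .hook     = my_nat_lookup,
    .pf       = NFPROTO_IPV6,
    .hooknum  = NF_INET_PRE_ROUTING,
    .priority = NF_IP6_PRI_NAT_DST, /* example priority */
};

static int my_init(struct net *net)
{
    /* threads my_ops into the shared per-hook list walked by
     * nf_nat_inet_fn(); the first caller registers the backend hooks */
    return nf_nat_l3proto_ipv6_register_fn(net, &my_ops);
}

static void my_exit(struct net *net)
{
    nf_nat_l3proto_ipv6_unregister_fn(net, &my_ops);
}
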
*/ - if (trans->msg_type == NFT_MSG_NEWCHAIN) - return false; - - ret = true; - } - } - - return ret; -} - static int nf_tables_register_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { - struct nf_hook_ops *ops; - int ret; + const struct nft_base_chain *basechain; + const struct nf_hook_ops *ops; if (table->flags & NFT_TABLE_F_DORMANT || !nft_is_base_chain(chain)) return 0; - ops = &nft_base_chain(chain)->ops; - ret = nf_register_net_hook(net, ops); - if (ret == -EBUSY && nf_tables_allow_nat_conflict(net, ops)) { - ops->nat_hook = false; - ret = nf_register_net_hook(net, ops); - ops->nat_hook = true; - } + basechain = nft_base_chain(chain); + ops = &basechain->ops; - return ret; + if (basechain->type->ops_register) + return basechain->type->ops_register(net, ops); + + return nf_register_net_hook(net, ops); } static void nf_tables_unregister_hook(struct net *net, const struct nft_table *table, struct nft_chain *chain) { + const struct nft_base_chain *basechain; + const struct nf_hook_ops *ops; + if (table->flags & NFT_TABLE_F_DORMANT || !nft_is_base_chain(chain)) return; + basechain = nft_base_chain(chain); + ops = &basechain->ops; + + if (basechain->type->ops_unregister) + return basechain->type->ops_unregister(net, ops); - nf_unregister_net_hook(net, &nft_base_chain(chain)->ops); + nf_unregister_net_hook(net, ops); } static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) @@ -214,6 +169,34 @@ static int nft_delchain(struct nft_ctx *ctx) return err; } +static void nft_rule_expr_activate(const struct nft_ctx *ctx, + struct nft_rule *rule) +{ + struct nft_expr *expr; + + expr = nft_expr_first(rule); + while (expr != nft_expr_last(rule) && expr->ops) { + if (expr->ops->activate) + expr->ops->activate(ctx, expr); + + expr = nft_expr_next(expr); + } +} + +static void nft_rule_expr_deactivate(const struct nft_ctx *ctx, + struct nft_rule *rule) +{ + struct nft_expr *expr; + + expr = nft_expr_first(rule); + while (expr != nft_expr_last(rule) && expr->ops) { + if (expr->ops->deactivate) + expr->ops->deactivate(ctx, expr); + + expr = nft_expr_next(expr); + } +} + static int nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule) { @@ -259,6 +242,7 @@ static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) nft_trans_destroy(trans); return err; } + nft_rule_expr_deactivate(ctx, rule); return 0; } @@ -1262,8 +1246,6 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); - if (basechain->type->free) - basechain->type->free(ctx); module_put(basechain->type->owner); free_percpu(basechain->stats); if (basechain->stats) @@ -1396,9 +1378,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, } basechain->type = hook.type; - if (basechain->type->init) - basechain->type->init(ctx); - chain = &basechain->chain; ops = &basechain->ops; @@ -1409,9 +1388,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, ops->hook = hook.type->hooks[ops->hooknum]; ops->dev = hook.dev; - if (basechain->type->type == NFT_CHAIN_T_NAT) - ops->nat_hook = true; - chain->flags |= NFT_BASE_CHAIN; basechain->policy = policy; } else { @@ -2242,6 +2218,13 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, kfree(rule); } +static void nf_tables_rule_release(const struct nft_ctx *ctx, + struct nft_rule *rule) +{ + nft_rule_expr_deactivate(ctx, rule); + nf_tables_rule_destroy(ctx, rule); +} + #define NFT_RULE_MAXEXPRS 
128 static struct nft_expr_info *info; @@ -2415,7 +2398,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return 0; err2: - nf_tables_rule_destroy(&ctx, rule); + nf_tables_rule_release(&ctx, rule); err1: for (i = 0; i < n; i++) { if (info[i].ops != NULL) @@ -4098,8 +4081,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^ nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) || nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^ - nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) - return -EBUSY; + nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) { + err = -EBUSY; + goto err5; + } if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && memcmp(nft_set_ext_data(ext), @@ -4185,7 +4170,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, * NFT_GOTO verdicts. This function must be called on active data objects * from the second phase of the commit protocol. */ -static void nft_data_hold(const struct nft_data *data, enum nft_data_types type) +void nft_data_hold(const struct nft_data *data, enum nft_data_types type) { if (type == NFT_DATA_VERDICT) { switch (data->verdict.code) { @@ -5821,7 +5806,7 @@ static void nft_chain_commit_update(struct nft_trans *trans) } } -static void nf_tables_commit_release(struct nft_trans *trans) +static void nft_commit_release(struct nft_trans *trans) { switch (trans->msg_type) { case NFT_MSG_DELTABLE: @@ -5850,6 +5835,21 @@ static void nf_tables_commit_release(struct nft_trans *trans) kfree(trans); } +static void nf_tables_commit_release(struct net *net) +{ + struct nft_trans *trans, *next; + + if (list_empty(&net->nft.commit_list)) + return; + + synchronize_rcu(); + + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_del(&trans->list); + nft_commit_release(trans); + } +} + static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nft_trans *trans, *next; @@ -5980,13 +5980,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } } - synchronize_rcu(); - - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { - list_del(&trans->list); - nf_tables_commit_release(trans); - } - + nf_tables_commit_release(net); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); return 0; @@ -6066,10 +6060,12 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWRULE: trans->ctx.chain->use--; list_del_rcu(&nft_trans_rule(trans)->list); + nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans)); break; case NFT_MSG_DELRULE: trans->ctx.chain->use++; nft_clear(trans->ctx.net, nft_trans_rule(trans)); + nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans)); nft_trans_destroy(trans); break; case NFT_MSG_NEWSET: @@ -6645,7 +6641,7 @@ int __nft_release_basechain(struct nft_ctx *ctx) list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { list_del(&rule->list); ctx->chain->use--; - nf_tables_rule_destroy(ctx, rule); + nf_tables_rule_release(ctx, rule); } list_del(&ctx->chain->list); ctx->table->use--; @@ -6683,7 +6679,7 @@ static void __nft_release_tables(struct net *net) list_for_each_entry_safe(rule, nr, &chain->rules, list) { list_del(&rule->list); chain->use--; - nf_tables_rule_destroy(&ctx, rule); + nf_tables_rule_release(&ctx, rule); } } list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) { diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 9cf47c4cb9d5..4f46d2f4e167 100644 --- 
a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -41,7 +41,7 @@ static const struct nf_loginfo trace_loginfo = { static noinline void __nft_trace_packet(struct nft_traceinfo *info, const struct nft_chain *chain, - int rulenum, enum nft_trace_types type) + enum nft_trace_types type) { const struct nft_pktinfo *pkt = info->pkt; @@ -52,22 +52,16 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info, info->type = type; nft_trace_notify(info); - - nf_log_trace(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb, - nft_in(pkt), nft_out(pkt), &trace_loginfo, - "TRACE: %s:%s:%s:%u ", - chain->table->name, chain->name, comments[type], rulenum); } static inline void nft_trace_packet(struct nft_traceinfo *info, const struct nft_chain *chain, const struct nft_rule *rule, - int rulenum, enum nft_trace_types type) { if (static_branch_unlikely(&nft_trace_enabled)) { info->rule = rule; - __nft_trace_packet(info, chain, rulenum, type); + __nft_trace_packet(info, chain, type); } } @@ -119,21 +113,27 @@ DEFINE_STATIC_KEY_FALSE(nft_counters_enabled); static noinline void nft_update_chain_stats(const struct nft_chain *chain, const struct nft_pktinfo *pkt) { + struct nft_base_chain *base_chain; struct nft_stats *stats; - local_bh_disable(); - stats = this_cpu_ptr(rcu_dereference(nft_base_chain(chain)->stats)); - u64_stats_update_begin(&stats->syncp); - stats->pkts++; - stats->bytes += pkt->skb->len; - u64_stats_update_end(&stats->syncp); - local_bh_enable(); + base_chain = nft_base_chain(chain); + if (!base_chain->stats) + return; + + stats = this_cpu_ptr(rcu_dereference(base_chain->stats)); + if (stats) { + local_bh_disable(); + u64_stats_update_begin(&stats->syncp); + stats->pkts++; + stats->bytes += pkt->skb->len; + u64_stats_update_end(&stats->syncp); + local_bh_enable(); + } } struct nft_jumpstack { const struct nft_chain *chain; const struct nft_rule *rule; - int rulenum; }; unsigned int @@ -146,7 +146,6 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv) struct nft_regs regs; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; - int rulenum; unsigned int gencursor = nft_genmask_cur(net); struct nft_traceinfo info; @@ -154,7 +153,6 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv) if (static_branch_unlikely(&nft_trace_enabled)) nft_trace_init(&info, pkt, ®s.verdict, basechain); do_chain: - rulenum = 0; rule = list_entry(&chain->rules, struct nft_rule, list); next_rule: regs.verdict.code = NFT_CONTINUE; @@ -164,8 +162,6 @@ next_rule: if (unlikely(rule->genmask & gencursor)) continue; - rulenum++; - nft_rule_for_each_expr(expr, last, rule) { if (expr->ops == &nft_cmp_fast_ops) nft_cmp_fast_eval(expr, ®s); @@ -183,7 +179,7 @@ next_rule: continue; case NFT_CONTINUE: nft_trace_packet(&info, chain, rule, - rulenum, NFT_TRACETYPE_RULE); + NFT_TRACETYPE_RULE); continue; } break; @@ -195,7 +191,7 @@ next_rule: case NF_QUEUE: case NF_STOLEN: nft_trace_packet(&info, chain, rule, - rulenum, NFT_TRACETYPE_RULE); + NFT_TRACETYPE_RULE); return regs.verdict.code; } @@ -204,21 +200,19 @@ next_rule: BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE); jumpstack[stackptr].chain = chain; jumpstack[stackptr].rule = rule; - jumpstack[stackptr].rulenum = rulenum; stackptr++; /* fall through */ case NFT_GOTO: nft_trace_packet(&info, chain, rule, - rulenum, NFT_TRACETYPE_RULE); + NFT_TRACETYPE_RULE); chain = regs.verdict.chain; goto do_chain; case NFT_CONTINUE: - rulenum++; /* fall through */ case NFT_RETURN: nft_trace_packet(&info, chain, rule, - rulenum, 
NFT_TRACETYPE_RETURN); + NFT_TRACETYPE_RETURN); break; default: WARN_ON(1); @@ -228,12 +222,10 @@ next_rule: stackptr--; chain = jumpstack[stackptr].chain; rule = jumpstack[stackptr].rule; - rulenum = jumpstack[stackptr].rulenum; goto next_rule; } - nft_trace_packet(&info, basechain, NULL, -1, - NFT_TRACETYPE_POLICY); + nft_trace_packet(&info, basechain, NULL, NFT_TRACETYPE_POLICY); if (static_branch_unlikely(&nft_counters_enabled)) nft_update_chain_stats(basechain, pkt); diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index b9505bcd3827..6ddf89183e7b 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -115,7 +115,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, nfacct->flags = flags; } - strncpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX); + nla_strlcpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX); if (tb[NFACCT_BYTES]) { atomic64_set(&nfacct->bytes, diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 4a4b293fb2e5..fa026b269b36 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -149,8 +149,8 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy, !tb[NFCTH_POLICY_EXPECT_TIMEOUT]) return -EINVAL; - strncpy(expect_policy->name, - nla_data(tb[NFCTH_POLICY_NAME]), NF_CT_HELPER_NAME_LEN); + nla_strlcpy(expect_policy->name, + nla_data(tb[NFCTH_POLICY_NAME]), NF_CT_HELPER_NAME_LEN); expect_policy->max_expected = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); if (expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT) @@ -234,7 +234,8 @@ nfnl_cthelper_create(const struct nlattr * const tb[], if (ret < 0) goto err1; - strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN); + nla_strlcpy(helper->name, + nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN); size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); if (size > FIELD_SIZEOF(struct nf_conn_help, data)) { ret = -ENOMEM; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 74a04638ef03..2c173042ac0e 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -227,6 +227,25 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) return entry; } +static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) +{ + struct nf_ct_hook *ct_hook; + int err; + + if (verdict == NF_ACCEPT || + verdict == NF_STOP) { + rcu_read_lock(); + ct_hook = rcu_dereference(nf_ct_hook); + if (ct_hook) { + err = ct_hook->update(entry->state.net, entry->skb); + if (err < 0) + verdict = NF_DROP; + } + rcu_read_unlock(); + } + nf_reinject(entry, verdict); +} + static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) { @@ -237,7 +256,7 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) if (!cmpfn || cmpfn(entry, data)) { list_del(&entry->list); queue->queue_total--; - nf_reinject(entry, NF_DROP); + nfqnl_reinject(entry, NF_DROP); } } spin_unlock_bh(&queue->lock); @@ -686,7 +705,7 @@ err_out_free_nskb: err_out_unlock: spin_unlock_bh(&queue->lock); if (failopen) - nf_reinject(entry, NF_ACCEPT); + nfqnl_reinject(entry, NF_ACCEPT); err_out: return err; } @@ -1085,7 +1104,8 @@ static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl, list_for_each_entry_safe(entry, tmp, &batch_list, list) { if (nfqa[NFQA_MARK]) entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); - nf_reinject(entry, 
verdict);
+
+ nfqnl_reinject(entry, verdict);
 }
 return 0;
 }
@@ -1208,7 +1228,7 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
 if (nfqa[NFQA_MARK])
 entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
- nf_reinject(entry, verdict);
+ nfqnl_reinject(entry, verdict);
 return 0;
 }
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 8e23726b9081..1d99a1efdafc 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -27,14 +27,31 @@ struct nft_xt {
 struct list_head head;
 struct nft_expr_ops ops;
 unsigned int refcnt;
+
+ /* Unlike other expressions, ops doesn't have static storage duration.
+ * nft core assumes they do. We use kfree_rcu so that nft core can
+ * check expr->ops->size even after nft_compat->destroy() frees
+ * the nft_xt struct that holds the ops structure.
+ */
+ struct rcu_head rcu_head;
+};
+
+/* Used for matches where *info is larger than X bytes */
+#define NFT_MATCH_LARGE_THRESH 192
+
+struct nft_xt_match_priv {
+ void *info;
 };
 
-static void nft_xt_put(struct nft_xt *xt)
+static bool nft_xt_put(struct nft_xt *xt)
 {
 if (--xt->refcnt == 0) {
 list_del(&xt->head);
- kfree(xt);
+ kfree_rcu(xt, rcu_head);
+ return true;
 }
+
+ return false;
 }
 
 static int nft_compat_chain_validate_dependency(const char *tablename,
@@ -226,6 +243,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 struct xt_target *target = expr->ops->data;
 struct xt_tgchk_param par;
 size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO]));
+ struct nft_xt *nft_xt;
 u16 proto = 0;
 bool inv = false;
 union nft_entry e = {};
@@ -236,25 +254,22 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 if (ctx->nla[NFTA_RULE_COMPAT]) {
 ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv);
 if (ret < 0)
- goto err;
+ return ret;
 }
 
 nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
 
 ret = xt_check_target(&par, size, proto, inv);
 if (ret < 0)
- goto err;
+ return ret;
 
 /* The standard target cannot be used */
- if (target->target == NULL) {
- ret = -EINVAL;
- goto err;
- }
+ if (!target->target)
+ return -EINVAL;
 
+ nft_xt = container_of(expr->ops, struct nft_xt, ops);
+ nft_xt->refcnt++;
 return 0;
-err:
- module_put(target->me);
- return ret;
 }
 
 static void
@@ -271,8 +286,8 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 if (par.target->destroy != NULL)
 par.target->destroy(&par);
 
- nft_xt_put(container_of(expr->ops, struct nft_xt, ops));
- module_put(target->me);
+ if (nft_xt_put(container_of(expr->ops, struct nft_xt, ops)))
+ module_put(target->me);
 }
 
 static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -316,11 +331,11 @@ static int nft_target_validate(const struct nft_ctx *ctx,
 return 0;
 }
 
-static void nft_match_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static void __nft_match_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt,
+ void *info)
 {
- void *info = nft_expr_priv(expr);
 struct xt_match *match = expr->ops->data;
 struct sk_buff *skb = pkt->skb;
 bool ret;
@@ -344,6 +359,22 @@ static void nft_match_eval(const struct nft_expr *expr,
 }
 }
 
+static void nft_match_large_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_xt_match_priv *priv = nft_expr_priv(expr);
+
+ __nft_match_eval(expr, regs, pkt, priv->info);
+}
+
+static void nft_match_eval(const struct nft_expr *expr,
+ struct nft_regs 
*regs, + const struct nft_pktinfo *pkt) +{ + __nft_match_eval(expr, regs, pkt, nft_expr_priv(expr)); +} + static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = { [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING }, [NFTA_MATCH_REV] = { .type = NLA_U32 }, @@ -404,13 +435,14 @@ static void match_compat_from_user(struct xt_match *m, void *in, void *out) } static int -nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nlattr * const tb[]) +__nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[], + void *info) { - void *info = nft_expr_priv(expr); struct xt_match *match = expr->ops->data; struct xt_mtchk_param par; size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO])); + struct nft_xt *nft_xt; u16 proto = 0; bool inv = false; union nft_entry e = {}; @@ -421,26 +453,50 @@ nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (ctx->nla[NFTA_RULE_COMPAT]) { ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv); if (ret < 0) - goto err; + return ret; } nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); ret = xt_check_match(&par, size, proto, inv); if (ret < 0) - goto err; + return ret; + nft_xt = container_of(expr->ops, struct nft_xt, ops); + nft_xt->refcnt++; return 0; -err: - module_put(match->me); +} + +static int +nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + return __nft_match_init(ctx, expr, tb, nft_expr_priv(expr)); +} + +static int +nft_match_large_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_xt_match_priv *priv = nft_expr_priv(expr); + struct xt_match *m = expr->ops->data; + int ret; + + priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL); + if (!priv->info) + return -ENOMEM; + + ret = __nft_match_init(ctx, expr, tb, priv->info); + if (ret) + kfree(priv->info); return ret; } static void -nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +__nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr, + void *info) { struct xt_match *match = expr->ops->data; - void *info = nft_expr_priv(expr); struct xt_mtdtor_param par; par.net = ctx->net; @@ -450,13 +506,28 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) if (par.match->destroy != NULL) par.match->destroy(&par); - nft_xt_put(container_of(expr->ops, struct nft_xt, ops)); - module_put(match->me); + if (nft_xt_put(container_of(expr->ops, struct nft_xt, ops))) + module_put(match->me); } -static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr) +static void +nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + __nft_match_destroy(ctx, expr, nft_expr_priv(expr)); +} + +static void +nft_match_large_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + struct nft_xt_match_priv *priv = nft_expr_priv(expr); + + __nft_match_destroy(ctx, expr, priv->info); + kfree(priv->info); +} + +static int __nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr, + void *info) { - void *info = nft_expr_priv(expr); struct xt_match *match = expr->ops->data; if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) || @@ -470,6 +541,18 @@ nla_put_failure: return -1; } +static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + return __nft_match_dump(skb, expr, nft_expr_priv(expr)); +} + +static int nft_match_large_dump(struct sk_buff *skb, 
const struct nft_expr *e)
+{
+ struct nft_xt_match_priv *priv = nft_expr_priv(e);
+
+ return __nft_match_dump(skb, e, priv->info);
+}
+
 static int nft_match_validate(const struct nft_ctx *ctx,
 const struct nft_expr *expr,
 const struct nft_data **data)
@@ -637,6 +720,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 {
 struct nft_xt *nft_match;
 struct xt_match *match;
+ unsigned int matchsize;
 char *mt_name;
 u32 rev, family;
 int err;
@@ -654,13 +738,8 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 list_for_each_entry(nft_match, &nft_match_list, head) {
 struct xt_match *match = nft_match->ops.data;
 
- if (nft_match_cmp(match, mt_name, rev, family)) {
- if (!try_module_get(match->me))
- return ERR_PTR(-ENOENT);
-
- nft_match->refcnt++;
+ if (nft_match_cmp(match, mt_name, rev, family))
 return &nft_match->ops;
- }
 }
 
 match = xt_request_find_match(family, mt_name, rev);
@@ -679,9 +758,8 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 goto err;
 }
 
- nft_match->refcnt = 1;
+ nft_match->refcnt = 0;
 nft_match->ops.type = &nft_match_type;
- nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize));
 nft_match->ops.eval = nft_match_eval;
 nft_match->ops.init = nft_match_init;
 nft_match->ops.destroy = nft_match_destroy;
@@ -689,6 +767,18 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 nft_match->ops.validate = nft_match_validate;
 nft_match->ops.data = match;
 
+ matchsize = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize));
+ if (matchsize > NFT_MATCH_LARGE_THRESH) {
+ matchsize = NFT_EXPR_SIZE(sizeof(struct nft_xt_match_priv));
+
+ nft_match->ops.eval = nft_match_large_eval;
+ nft_match->ops.init = nft_match_large_init;
+ nft_match->ops.destroy = nft_match_large_destroy;
+ nft_match->ops.dump = nft_match_large_dump;
+ }
+
+ nft_match->ops.size = matchsize;
+
 list_add(&nft_match->head, &nft_match_list);
 return &nft_match->ops;
 
@@ -739,13 +829,8 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 list_for_each_entry(nft_target, &nft_target_list, head) {
 struct xt_target *target = nft_target->ops.data;
 
- if (nft_target_cmp(target, tg_name, rev, family)) {
- if (!try_module_get(target->me))
- return ERR_PTR(-ENOENT);
-
- nft_target->refcnt++;
+ if (nft_target_cmp(target, tg_name, rev, family))
 return &nft_target->ops;
- }
 }
 
 target = xt_request_find_target(family, tg_name, rev);
@@ -764,7 +849,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 goto err;
 }
 
- nft_target->refcnt = 1;
+ nft_target->refcnt = 0;
 nft_target->ops.type = &nft_target_type;
 nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize));
 nft_target->ops.init = nft_target_init;
@@ -823,6 +908,32 @@ err_match:
 
 static void __exit nft_compat_module_exit(void)
 {
+ struct nft_xt *xt, *next;
+
+ /* list should be empty here, it can be non-empty only in case there
+ * was an error that caused nft_xt expr to not be initialized fully
+ * and no one else requested the same expression later.
+ *
+ * In this case, the lists contain 0-refcount entries that still
+ * hold a module reference. 
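+ * (editor's note, not part of the patch: refcnt can stay at 0 when
+ * ->select_ops() found or created the expression and took the xt
+ * module reference, but no ->init() ever ran; only a successful init
+ * bumps refcnt, and only the destroy of the last user drops the
+ * module reference)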
+ */ + list_for_each_entry_safe(xt, next, &nft_target_list, head) { + struct xt_target *target = xt->ops.data; + + if (WARN_ON_ONCE(xt->refcnt)) + continue; + module_put(target->me); + kfree(xt); + } + + list_for_each_entry_safe(xt, next, &nft_match_list, head) { + struct xt_match *match = xt->ops.data; + + if (WARN_ON_ONCE(xt->refcnt)) + continue; + module_put(match->me); + kfree(xt); + } nfnetlink_subsys_unregister(&nfnl_compat_subsys); nft_unregister_expr(&nft_target_type); nft_unregister_expr(&nft_match_type); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index e235c17f1b8b..f0fc21f88775 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -25,6 +25,7 @@ struct nft_jhash { u32 modulus; u32 seed; u32 offset; + struct nft_set *map; }; static void nft_jhash_eval(const struct nft_expr *expr, @@ -35,14 +36,39 @@ static void nft_jhash_eval(const struct nft_expr *expr, const void *data = ®s->data[priv->sreg]; u32 h; - h = reciprocal_scale(jhash(data, priv->len, priv->seed), priv->modulus); + h = reciprocal_scale(jhash(data, priv->len, priv->seed), + priv->modulus); + regs->data[priv->dreg] = h + priv->offset; } +static void nft_jhash_map_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_jhash *priv = nft_expr_priv(expr); + const void *data = ®s->data[priv->sreg]; + const struct nft_set *map = priv->map; + const struct nft_set_ext *ext; + u32 result; + bool found; + + result = reciprocal_scale(jhash(data, priv->len, priv->seed), + priv->modulus) + priv->offset; + + found = map->ops->lookup(nft_net(pkt), map, &result, &ext); + if (!found) + return; + + nft_data_copy(®s->data[priv->dreg], + nft_set_ext_data(ext), map->dlen); +} + struct nft_symhash { enum nft_registers dreg:8; u32 modulus; u32 offset; + struct nft_set *map; }; static void nft_symhash_eval(const struct nft_expr *expr, @@ -58,6 +84,28 @@ static void nft_symhash_eval(const struct nft_expr *expr, regs->data[priv->dreg] = h + priv->offset; } +static void nft_symhash_map_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_symhash *priv = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; + const struct nft_set *map = priv->map; + const struct nft_set_ext *ext; + u32 result; + bool found; + + result = reciprocal_scale(__skb_get_hash_symmetric(skb), + priv->modulus) + priv->offset; + + found = map->ops->lookup(nft_net(pkt), map, &result, &ext); + if (!found) + return; + + nft_data_copy(®s->data[priv->dreg], + nft_set_ext_data(ext), map->dlen); +} + static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { [NFTA_HASH_SREG] = { .type = NLA_U32 }, [NFTA_HASH_DREG] = { .type = NLA_U32 }, @@ -66,6 +114,9 @@ static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { [NFTA_HASH_SEED] = { .type = NLA_U32 }, [NFTA_HASH_OFFSET] = { .type = NLA_U32 }, [NFTA_HASH_TYPE] = { .type = NLA_U32 }, + [NFTA_HASH_SET_NAME] = { .type = NLA_STRING, + .len = NFT_SET_MAXNAMELEN - 1 }, + [NFTA_HASH_SET_ID] = { .type = NLA_U32 }, }; static int nft_jhash_init(const struct nft_ctx *ctx, @@ -115,6 +166,23 @@ static int nft_jhash_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } +static int nft_jhash_map_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_jhash *priv = nft_expr_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); + + nft_jhash_init(ctx, expr, tb); + priv->map = nft_set_lookup_global(ctx->net, 
ctx->table, + tb[NFTA_HASH_SET_NAME], + tb[NFTA_HASH_SET_ID], genmask); + if (IS_ERR(priv->map)) + return PTR_ERR(priv->map); + + return 0; +} + static int nft_symhash_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -141,6 +209,23 @@ static int nft_symhash_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } +static int nft_symhash_map_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_jhash *priv = nft_expr_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); + + nft_symhash_init(ctx, expr, tb); + priv->map = nft_set_lookup_global(ctx->net, ctx->table, + tb[NFTA_HASH_SET_NAME], + tb[NFTA_HASH_SET_ID], genmask); + if (IS_ERR(priv->map)) + return PTR_ERR(priv->map); + + return 0; +} + static int nft_jhash_dump(struct sk_buff *skb, const struct nft_expr *expr) { @@ -168,6 +253,18 @@ nla_put_failure: return -1; } +static int nft_jhash_map_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_jhash *priv = nft_expr_priv(expr); + + if (nft_jhash_dump(skb, expr) || + nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name)) + return -1; + + return 0; +} + static int nft_symhash_dump(struct sk_buff *skb, const struct nft_expr *expr) { @@ -188,6 +285,18 @@ nla_put_failure: return -1; } +static int nft_symhash_map_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_symhash *priv = nft_expr_priv(expr); + + if (nft_symhash_dump(skb, expr) || + nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name)) + return -1; + + return 0; +} + static struct nft_expr_type nft_hash_type; static const struct nft_expr_ops nft_jhash_ops = { .type = &nft_hash_type, @@ -197,6 +306,14 @@ static const struct nft_expr_ops nft_jhash_ops = { .dump = nft_jhash_dump, }; +static const struct nft_expr_ops nft_jhash_map_ops = { + .type = &nft_hash_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_jhash)), + .eval = nft_jhash_map_eval, + .init = nft_jhash_map_init, + .dump = nft_jhash_map_dump, +}; + static const struct nft_expr_ops nft_symhash_ops = { .type = &nft_hash_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_symhash)), @@ -205,6 +322,14 @@ static const struct nft_expr_ops nft_symhash_ops = { .dump = nft_symhash_dump, }; +static const struct nft_expr_ops nft_symhash_map_ops = { + .type = &nft_hash_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_symhash)), + .eval = nft_symhash_map_eval, + .init = nft_symhash_map_init, + .dump = nft_symhash_map_dump, +}; + static const struct nft_expr_ops * nft_hash_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -217,8 +342,12 @@ nft_hash_select_ops(const struct nft_ctx *ctx, type = ntohl(nla_get_be32(tb[NFTA_HASH_TYPE])); switch (type) { case NFT_HASH_SYM: + if (tb[NFTA_HASH_SET_NAME]) + return &nft_symhash_map_ops; return &nft_symhash_ops; case NFT_HASH_JENKINS: + if (tb[NFTA_HASH_SET_NAME]) + return &nft_jhash_map_ops; return &nft_jhash_ops; default: break; diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 4717d7796927..aa87ff8beae8 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -69,8 +69,16 @@ err1: return err; } -static void nft_immediate_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) +static void nft_immediate_activate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + + return nft_data_hold(&priv->data, 
nft_dreg_to_type(priv->dreg)); +} + +static void nft_immediate_deactivate(const struct nft_ctx *ctx, + const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); @@ -108,7 +116,8 @@ static const struct nft_expr_ops nft_imm_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), .eval = nft_immediate_eval, .init = nft_immediate_init, - .destroy = nft_immediate_destroy, + .activate = nft_immediate_activate, + .deactivate = nft_immediate_deactivate, .dump = nft_immediate_dump, .validate = nft_immediate_validate, }; diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 8a64db8f2e69..cdbc62a53933 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -166,18 +166,43 @@ struct nft_ng_random { enum nft_registers dreg:8; u32 modulus; u32 offset; + struct nft_set *map; }; +static u32 nft_ng_random_gen(struct nft_ng_random *priv) +{ + struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state); + + return reciprocal_scale(prandom_u32_state(state), priv->modulus) + + priv->offset; +} + static void nft_ng_random_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_ng_random *priv = nft_expr_priv(expr); - struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state); - u32 val; - val = reciprocal_scale(prandom_u32_state(state), priv->modulus); - regs->data[priv->dreg] = val + priv->offset; + regs->data[priv->dreg] = nft_ng_random_gen(priv); +} + +static void nft_ng_random_map_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_ng_random *priv = nft_expr_priv(expr); + const struct nft_set *map = priv->map; + const struct nft_set_ext *ext; + u32 result; + bool found; + + result = nft_ng_random_gen(priv); + found = map->ops->lookup(nft_net(pkt), map, &result, &ext); + if (!found) + return; + + nft_data_copy(®s->data[priv->dreg], + nft_set_ext_data(ext), map->dlen); } static int nft_ng_random_init(const struct nft_ctx *ctx, @@ -204,6 +229,23 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } +static int nft_ng_random_map_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_ng_random *priv = nft_expr_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); + + nft_ng_random_init(ctx, expr, tb); + priv->map = nft_set_lookup_global(ctx->net, ctx->table, + tb[NFTA_NG_SET_NAME], + tb[NFTA_NG_SET_ID], genmask); + if (IS_ERR(priv->map)) + return PTR_ERR(priv->map); + + return 0; +} + static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_ng_random *priv = nft_expr_priv(expr); @@ -212,6 +254,22 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) priv->offset); } +static int nft_ng_random_map_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_ng_random *priv = nft_expr_priv(expr); + + if (nft_ng_dump(skb, priv->dreg, priv->modulus, + NFT_NG_RANDOM, priv->offset) || + nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + static struct nft_expr_type nft_ng_type; static const struct nft_expr_ops nft_ng_inc_ops = { .type = &nft_ng_type, @@ -237,6 +295,14 @@ static const struct nft_expr_ops nft_ng_random_ops = { .dump = nft_ng_random_dump, }; +static const struct nft_expr_ops nft_ng_random_map_ops = { + .type = &nft_ng_type, + .size = 
NFT_EXPR_SIZE(sizeof(struct nft_ng_random)), + .eval = nft_ng_random_map_eval, + .init = nft_ng_random_map_init, + .dump = nft_ng_random_map_dump, +}; + static const struct nft_expr_ops * nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { @@ -255,6 +321,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) return &nft_ng_inc_map_ops; return &nft_ng_inc_ops; case NFT_NG_RANDOM: + if (tb[NFTA_NG_SET_NAME]) + return &nft_ng_random_map_ops; return &nft_ng_random_ops; } diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 22c57d7612c4..d260ce2d6671 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -22,6 +22,7 @@ struct nft_rbtree { struct rb_root root; rwlock_t lock; seqcount_t count; + struct delayed_work gc_work; }; struct nft_rbtree_elem { @@ -265,6 +266,7 @@ static void nft_rbtree_activate(const struct net *net, struct nft_rbtree_elem *rbe = elem->priv; nft_set_elem_change_active(net, set, &rbe->ext); + nft_set_elem_clear_busy(&rbe->ext); } static bool nft_rbtree_flush(const struct net *net, @@ -272,8 +274,12 @@ static bool nft_rbtree_flush(const struct net *net, { struct nft_rbtree_elem *rbe = priv; - nft_set_elem_change_active(net, set, &rbe->ext); - return true; + if (!nft_set_elem_mark_busy(&rbe->ext) || + !nft_is_active(net, &rbe->ext)) { + nft_set_elem_change_active(net, set, &rbe->ext); + return true; + } + return false; } static void *nft_rbtree_deactivate(const struct net *net, @@ -347,6 +353,62 @@ cont: read_unlock_bh(&priv->lock); } +static void nft_rbtree_gc(struct work_struct *work) +{ + struct nft_set_gc_batch *gcb = NULL; + struct rb_node *node, *prev = NULL; + struct nft_rbtree_elem *rbe; + struct nft_rbtree *priv; + struct nft_set *set; + int i; + + priv = container_of(work, struct nft_rbtree, gc_work.work); + set = nft_set_container_of(priv); + + write_lock_bh(&priv->lock); + write_seqcount_begin(&priv->count); + for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { + rbe = rb_entry(node, struct nft_rbtree_elem, node); + + if (nft_rbtree_interval_end(rbe)) { + prev = node; + continue; + } + if (!nft_set_elem_expired(&rbe->ext)) + continue; + if (nft_set_elem_mark_busy(&rbe->ext)) + continue; + + gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); + if (!gcb) + goto out; + + atomic_dec(&set->nelems); + nft_set_gc_batch_add(gcb, rbe); + + if (prev) { + rbe = rb_entry(prev, struct nft_rbtree_elem, node); + atomic_dec(&set->nelems); + nft_set_gc_batch_add(gcb, rbe); + } + node = rb_next(node); + } +out: + if (gcb) { + for (i = 0; i < gcb->head.cnt; i++) { + rbe = gcb->elems[i]; + rb_erase(&rbe->node, &priv->root); + } + } + write_seqcount_end(&priv->count); + write_unlock_bh(&priv->lock); + + nft_set_gc_batch_complete(gcb); + + queue_delayed_work(system_power_efficient_wq, &priv->gc_work, + nft_set_gc_interval(set)); +} + static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[], const struct nft_set_desc *desc) { @@ -362,6 +424,12 @@ static int nft_rbtree_init(const struct nft_set *set, rwlock_init(&priv->lock); seqcount_init(&priv->count); priv->root = RB_ROOT; + + INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc); + if (set->flags & NFT_SET_TIMEOUT) + queue_delayed_work(system_power_efficient_wq, &priv->gc_work, + nft_set_gc_interval(set)); + return 0; } @@ -371,6 +439,7 @@ static void nft_rbtree_destroy(const struct nft_set *set) struct nft_rbtree_elem *rbe; struct rb_node *node; + 
cancel_delayed_work_sync(&priv->gc_work); while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); @@ -395,7 +464,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, static struct nft_set_type nft_rbtree_type __read_mostly = { .owner = THIS_MODULE, - .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT, + .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { .privsize = nft_rbtree_privsize, .elemsize = offsetof(struct nft_rbtree_elem, ext), diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 71325fef647d..cb7cb300c3bc 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -183,6 +183,9 @@ struct xt_match *xt_find_match(u8 af, const char *name, u8 revision) struct xt_match *m; int err = -ENOENT; + if (strnlen(name, XT_EXTENSION_MAXNAMELEN) == XT_EXTENSION_MAXNAMELEN) + return ERR_PTR(-EINVAL); + mutex_lock(&xt[af].mutex); list_for_each_entry(m, &xt[af].match, list) { if (strcmp(m->name, name) == 0) { @@ -229,6 +232,9 @@ struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) struct xt_target *t; int err = -ENOENT; + if (strnlen(name, XT_EXTENSION_MAXNAMELEN) == XT_EXTENSION_MAXNAMELEN) + return ERR_PTR(-EINVAL); + mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt[af].target, list) { if (strcmp(t->name, name) == 0) { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 611a26d5235c..2cc98c763003 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2871,13 +2871,15 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (skb == NULL) goto out_unlock; - skb_set_network_header(skb, reserve); + skb_reset_network_header(skb); err = -EINVAL; if (sock->type == SOCK_DGRAM) { offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len); if (unlikely(offset < 0)) goto out_free; + } else if (reserve) { + skb_push(skb, reserve); } /* Returns -EFAULT on error */ diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 59d0eb960275..a7a4e6ff9be2 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -178,9 +178,10 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill) } static struct led_trigger rfkill_any_led_trigger; -static struct work_struct rfkill_any_work; +static struct led_trigger rfkill_none_led_trigger; +static struct work_struct rfkill_global_led_trigger_work; -static void rfkill_any_led_trigger_worker(struct work_struct *work) +static void rfkill_global_led_trigger_worker(struct work_struct *work) { enum led_brightness brightness = LED_OFF; struct rfkill *rfkill; @@ -195,30 +196,43 @@ static void rfkill_any_led_trigger_worker(struct work_struct *work) mutex_unlock(&rfkill_global_mutex); led_trigger_event(&rfkill_any_led_trigger, brightness); + led_trigger_event(&rfkill_none_led_trigger, + brightness == LED_OFF ? 
LED_FULL : LED_OFF); } -static void rfkill_any_led_trigger_event(void) +static void rfkill_global_led_trigger_event(void) { - schedule_work(&rfkill_any_work); + schedule_work(&rfkill_global_led_trigger_work); } -static void rfkill_any_led_trigger_activate(struct led_classdev *led_cdev) +static int rfkill_global_led_trigger_register(void) { - rfkill_any_led_trigger_event(); -} + int ret; + + INIT_WORK(&rfkill_global_led_trigger_work, + rfkill_global_led_trigger_worker); -static int rfkill_any_led_trigger_register(void) -{ - INIT_WORK(&rfkill_any_work, rfkill_any_led_trigger_worker); rfkill_any_led_trigger.name = "rfkill-any"; - rfkill_any_led_trigger.activate = rfkill_any_led_trigger_activate; - return led_trigger_register(&rfkill_any_led_trigger); + ret = led_trigger_register(&rfkill_any_led_trigger); + if (ret) + return ret; + + rfkill_none_led_trigger.name = "rfkill-none"; + ret = led_trigger_register(&rfkill_none_led_trigger); + if (ret) + led_trigger_unregister(&rfkill_any_led_trigger); + else + /* Delay activation until all global triggers are registered */ + rfkill_global_led_trigger_event(); + + return ret; } -static void rfkill_any_led_trigger_unregister(void) +static void rfkill_global_led_trigger_unregister(void) { + led_trigger_unregister(&rfkill_none_led_trigger); led_trigger_unregister(&rfkill_any_led_trigger); - cancel_work_sync(&rfkill_any_work); + cancel_work_sync(&rfkill_global_led_trigger_work); } #else static void rfkill_led_trigger_event(struct rfkill *rfkill) @@ -234,16 +248,16 @@ static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill) { } -static void rfkill_any_led_trigger_event(void) +static void rfkill_global_led_trigger_event(void) { } -static int rfkill_any_led_trigger_register(void) +static int rfkill_global_led_trigger_register(void) { return 0; } -static void rfkill_any_led_trigger_unregister(void) +static void rfkill_global_led_trigger_unregister(void) { } #endif /* CONFIG_RFKILL_LEDS */ @@ -354,7 +368,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked) spin_unlock_irqrestore(&rfkill->lock, flags); rfkill_led_trigger_event(rfkill); - rfkill_any_led_trigger_event(); + rfkill_global_led_trigger_event(); if (prev != curr) rfkill_event(rfkill); @@ -535,7 +549,7 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked) spin_unlock_irqrestore(&rfkill->lock, flags); rfkill_led_trigger_event(rfkill); - rfkill_any_led_trigger_event(); + rfkill_global_led_trigger_event(); if (rfkill->registered && prev != blocked) schedule_work(&rfkill->uevent_work); @@ -579,7 +593,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked) schedule_work(&rfkill->uevent_work); rfkill_led_trigger_event(rfkill); - rfkill_any_led_trigger_event(); + rfkill_global_led_trigger_event(); return blocked; } @@ -629,7 +643,7 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw) schedule_work(&rfkill->uevent_work); rfkill_led_trigger_event(rfkill); - rfkill_any_led_trigger_event(); + rfkill_global_led_trigger_event(); } } EXPORT_SYMBOL(rfkill_set_states); @@ -1046,7 +1060,7 @@ int __must_check rfkill_register(struct rfkill *rfkill) #endif } - rfkill_any_led_trigger_event(); + rfkill_global_led_trigger_event(); rfkill_send_events(rfkill, RFKILL_OP_ADD); mutex_unlock(&rfkill_global_mutex); @@ -1079,7 +1093,7 @@ void rfkill_unregister(struct rfkill *rfkill) mutex_lock(&rfkill_global_mutex); rfkill_send_events(rfkill, RFKILL_OP_DEL); list_del_init(&rfkill->node); - rfkill_any_led_trigger_event(); + rfkill_global_led_trigger_event(); 
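/* Editor's note (illustration, not part of the patch): "rfkill-any" is
 * driven to full brightness while at least one radio is unblocked, and
 * "rfkill-none" is its complement; both are updated from the same worker
 * above. A minimal, hypothetical consumer, assuming only the standard LED
 * class API of this kernel; the LED name is made up:
 *
 *	static struct led_classdev example_led = {
 *		.name		 = "example:green:wlan",
 *		.default_trigger = "rfkill-any",  // lit while any radio is on
 *	};
 *	// led_classdev_register(parent_dev, &example_led);
 */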
mutex_unlock(&rfkill_global_mutex); rfkill_led_trigger_unregister(rfkill); @@ -1332,7 +1346,7 @@ static int __init rfkill_init(void) if (error) goto error_misc; - error = rfkill_any_led_trigger_register(); + error = rfkill_global_led_trigger_register(); if (error) goto error_led_trigger; @@ -1346,7 +1360,7 @@ static int __init rfkill_init(void) #ifdef CONFIG_RFKILL_INPUT error_input: - rfkill_any_led_trigger_unregister(); + rfkill_global_led_trigger_unregister(); #endif error_led_trigger: misc_deregister(&rfkill_miscdev); @@ -1362,7 +1376,7 @@ static void __exit rfkill_exit(void) #ifdef CONFIG_RFKILL_INPUT rfkill_handler_exit(); #endif - rfkill_any_led_trigger_unregister(); + rfkill_global_led_trigger_unregister(); misc_deregister(&rfkill_miscdev); class_unregister(&rfkill_class); } diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 72251241665a..3f4cf930f809 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -77,9 +77,9 @@ static void free_tcf(struct tc_action *p) static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p) { - spin_lock_bh(&idrinfo->lock); + spin_lock(&idrinfo->lock); idr_remove(&idrinfo->action_idr, p->tcfa_index); - spin_unlock_bh(&idrinfo->lock); + spin_unlock(&idrinfo->lock); gen_kill_estimator(&p->tcfa_rate_est); free_tcf(p); } @@ -156,7 +156,7 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct tc_action *p; unsigned long id = 1; - spin_lock_bh(&idrinfo->lock); + spin_lock(&idrinfo->lock); s_i = cb->args[0]; @@ -191,7 +191,7 @@ done: if (index >= 0) cb->args[0] = index + 1; - spin_unlock_bh(&idrinfo->lock); + spin_unlock(&idrinfo->lock); if (n_i) { if (act_flags & TCA_FLAG_LARGE_DUMP_ON) cb->args[1] = n_i; @@ -261,9 +261,9 @@ static struct tc_action *tcf_idr_lookup(u32 index, struct tcf_idrinfo *idrinfo) { struct tc_action *p = NULL; - spin_lock_bh(&idrinfo->lock); + spin_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); - spin_unlock_bh(&idrinfo->lock); + spin_unlock(&idrinfo->lock); return p; } @@ -323,7 +323,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, } spin_lock_init(&p->tcfa_lock); idr_preload(GFP_KERNEL); - spin_lock_bh(&idrinfo->lock); + spin_lock(&idrinfo->lock); /* user doesn't specify an index */ if (!index) { index = 1; @@ -331,7 +331,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, } else { err = idr_alloc_u32(idr, NULL, &index, index, GFP_ATOMIC); } - spin_unlock_bh(&idrinfo->lock); + spin_unlock(&idrinfo->lock); idr_preload_end(); if (err) goto err3; @@ -369,9 +369,9 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a) { struct tcf_idrinfo *idrinfo = tn->idrinfo; - spin_lock_bh(&idrinfo->lock); + spin_lock(&idrinfo->lock); idr_replace(&idrinfo->action_idr, a, a->tcfa_index); - spin_unlock_bh(&idrinfo->lock); + spin_unlock(&idrinfo->lock); } EXPORT_SYMBOL(tcf_idr_insert); diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 853604685965..1fb39e1f9d07 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -161,6 +161,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, case htons(ETH_P_8021AD): break; default: + if (exists) + tcf_idr_release(*a, bind); return -EPROTONOSUPPORT; } } else { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ff3ce71aec93..760ab1b09f8b 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -656,7 +656,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) if 
(__skb_array_empty(q)) continue; - skb = skb_array_consume_bh(q); + skb = __skb_array_consume(q); } if (likely(skb)) { qdisc_qstats_cpu_backlog_dec(qdisc, skb); @@ -697,7 +697,7 @@ static void pfifo_fast_reset(struct Qdisc *qdisc) if (!q->ring.queue) continue; - while ((skb = skb_array_consume_bh(q)) != NULL) + while ((skb = __skb_array_consume(q)) != NULL) kfree_skb(skb); } @@ -858,6 +858,11 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, lockdep_set_class(&sch->busylock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + /* seqlock has the same scope of busylock, for NOLOCK qdisc */ + spin_lock_init(&sch->seqlock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + seqcount_init(&sch->running); lockdep_set_class(&sch->running, dev->qdisc_running_key ?: &qdisc_running_key); @@ -1097,6 +1102,10 @@ static void dev_deactivate_queue(struct net_device *dev, qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { + bool nolock = qdisc->flags & TCQ_F_NOLOCK; + + if (nolock) + spin_lock_bh(&qdisc->seqlock); spin_lock_bh(qdisc_lock(qdisc)); if (!(qdisc->flags & TCQ_F_BUILTIN)) @@ -1106,6 +1115,8 @@ static void dev_deactivate_queue(struct net_device *dev, qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); + if (nolock) + spin_unlock_bh(&qdisc->seqlock); } } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 16644b3d2362..56c181c3feeb 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -222,10 +222,11 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, extack); if (IS_ERR(child)) return PTR_ERR(child); - } - if (child != &noop_qdisc) + /* child is fifo, no need to check for noop_qdisc */ qdisc_hash_add(child, true); + } + sch_tree_lock(sch); q->flags = ctl->flags; q->limit = ctl->limit; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 03225a8df973..6f74a426f159 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -383,6 +383,9 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, err = PTR_ERR(child); goto done; } + + /* child is fifo, no need to check for noop_qdisc */ + qdisc_hash_add(child, true); } sch_tree_lock(sch); @@ -391,8 +394,6 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, q->qdisc->qstats.backlog); qdisc_destroy(q->qdisc); q->qdisc = child; - if (child != &noop_qdisc) - qdisc_hash_add(child, true); } q->limit = qopt->limit; if (tb[TCA_TBF_PBURST]) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index d15762b057c0..2c369d4bb1c1 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -8,8 +8,6 @@ * * Initial restrictions: * - support for alternate links postponed - * - partial support for non-blocking sockets only - * - support for urgent data postponed * * Copyright IBM Corp. 
2016, 2018 * @@ -46,11 +44,6 @@ static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group * creation */ -struct smc_lgr_list smc_lgr_list = { /* established link groups */ - .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), - .list = LIST_HEAD_INIT(smc_lgr_list.list), -}; - static void smc_tcp_listen_work(struct work_struct *); static void smc_set_keepalive(struct sock *sk, int val) @@ -193,8 +186,10 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_protocol = protocol; smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); INIT_LIST_HEAD(&smc->accept_q); spin_lock_init(&smc->accept_q_lock); + spin_lock_init(&smc->conn.send_lock); sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); @@ -380,10 +375,13 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) static void smc_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { - smc->conn.peer_conn_idx = clc->conn_idx; + int bufsize = smc_uncompress_bufsize(clc->rmbe_size); + + smc->conn.peer_rmbe_idx = clc->rmbe_idx; smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token); - smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size); + smc->conn.peer_rmbe_size = bufsize; atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); + smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); } static void smc_link_save_peer_info(struct smc_link *link, @@ -396,165 +394,186 @@ static void smc_link_save_peer_info(struct smc_link *link, link->peer_mtu = clc->qp_mtu; } -/* setup for RDMA connection of client */ -static int smc_connect_rdma(struct smc_sock *smc) +/* fall back during connect */ +static int smc_connect_fallback(struct smc_sock *smc) { - struct smc_clc_msg_accept_confirm aclc; - int local_contact = SMC_FIRST_CONTACT; - struct smc_ib_device *smcibdev; - struct smc_link *link; - u8 srv_first_contact; - int reason_code = 0; - int rc = 0; - u8 ibport; - - sock_hold(&smc->sk); /* sock put in passive closing */ + smc->use_fallback = true; + smc_copy_sock_settings_to_clc(smc); + if (smc->sk.sk_state == SMC_INIT) + smc->sk.sk_state = SMC_ACTIVE; + return 0; +} - if (smc->use_fallback) - goto out_connected; +/* decline and fall back during connect */ +static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code) +{ + int rc; - if (!tcp_sk(smc->clcsock->sk)->syn_smc) { - /* peer has not signalled SMC-capability */ - smc->use_fallback = true; - goto out_connected; + if (reason_code < 0) /* error, fallback is not possible */ + return reason_code; + if (reason_code != SMC_CLC_DECL_REPLY) { + rc = smc_clc_send_decline(smc, reason_code); + if (rc < 0) + return rc; } + return smc_connect_fallback(smc); +} - /* IPSec connections opt out of SMC-R optimizations */ - if (using_ipsec(smc)) { - reason_code = SMC_CLC_DECL_IPSEC; - goto decline_rdma; - } +/* abort connecting */ +static int smc_connect_abort(struct smc_sock *smc, int reason_code, + int local_contact) +{ + if (local_contact == SMC_FIRST_CONTACT) + smc_lgr_forget(smc->conn.lgr); + mutex_unlock(&smc_create_lgr_pending); + smc_conn_free(&smc->conn); + if (reason_code < 0 && smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ + return reason_code; +} + +/* check if there is a rdma device available for this connection. 
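+ * (editor's note, not part of the patch: on failure *ibdev stays NULL and
+ * the only code returned is SMC_CLC_DECL_CNFERR, which __smc_connect()
+ * below maps to smc_connect_decline_fallback(), i.e. a CLC decline plus a
+ * plain TCP fallback rather than a hard connect error)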
*/ +/* called for connect and listen */ +static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev, + u8 *ibport) +{ + int reason_code = 0; /* PNET table look up: search active ib_device and port * within same PNETID that also contains the ethernet device * used for the internal TCP socket */ - smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport); - if (!smcibdev) { + smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport); + if (!(*ibdev)) reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - goto decline_rdma; - } + + return reason_code; +} + +/* CLC handshake during connect */ +static int smc_connect_clc(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *aclc, + struct smc_ib_device *ibdev, u8 ibport) +{ + int rc = 0; /* do inband token exchange */ - reason_code = smc_clc_send_proposal(smc, smcibdev, ibport); - if (reason_code < 0) { - rc = reason_code; - goto out_err; - } - if (reason_code > 0) /* configuration error */ - goto decline_rdma; + rc = smc_clc_send_proposal(smc, ibdev, ibport); + if (rc) + return rc; /* receive SMC Accept CLC message */ - reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc), - SMC_CLC_ACCEPT); - if (reason_code < 0) { - rc = reason_code; - goto out_err; - } - if (reason_code > 0) - goto decline_rdma; + return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT); +} + +/* setup for RDMA connection of client */ +static int smc_connect_rdma(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *aclc, + struct smc_ib_device *ibdev, u8 ibport) +{ + int local_contact = SMC_FIRST_CONTACT; + struct smc_link *link; + int reason_code = 0; - srv_first_contact = aclc.hdr.flag; mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl, - srv_first_contact); + local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl, + aclc->hdr.flag); if (local_contact < 0) { - rc = local_contact; - if (rc == -ENOMEM) + if (local_contact == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - else if (rc == -ENOLINK) + else if (local_contact == -ENOLINK) reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. 
error */ else reason_code = SMC_CLC_DECL_INTERR; /* other error */ - goto decline_rdma_unlock; + return smc_connect_abort(smc, reason_code, 0); } link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK]; - smc_conn_save_peer_info(smc, &aclc); + smc_conn_save_peer_info(smc, aclc); /* create send buffer and rmb */ - rc = smc_buf_create(smc); - if (rc) { - reason_code = SMC_CLC_DECL_MEM; - goto decline_rdma_unlock; - } + if (smc_buf_create(smc)) + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); if (local_contact == SMC_FIRST_CONTACT) - smc_link_save_peer_info(link, &aclc); + smc_link_save_peer_info(link, aclc); - rc = smc_rmb_rtoken_handling(&smc->conn, &aclc); - if (rc) { - reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; - } + if (smc_rmb_rtoken_handling(&smc->conn, aclc)) + return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, + local_contact); smc_close_init(smc); smc_rx_init(smc); if (local_contact == SMC_FIRST_CONTACT) { - rc = smc_ib_ready_link(link); - if (rc) { - reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; - } + if (smc_ib_ready_link(link)) + return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, + local_contact); } else { - if (!smc->conn.rmb_desc->reused) { - if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) { - reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; - } - } + if (!smc->conn.rmb_desc->reused && + smc_reg_rmb(link, smc->conn.rmb_desc, true)) + return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, + local_contact); } smc_rmb_sync_sg_for_device(&smc->conn); - rc = smc_clc_send_confirm(smc); - if (rc) - goto out_err_unlock; + reason_code = smc_clc_send_confirm(smc); + if (reason_code) + return smc_connect_abort(smc, reason_code, local_contact); + + smc_tx_init(smc); if (local_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ reason_code = smc_clnt_conf_first_link(smc); - if (reason_code < 0) { - rc = reason_code; - goto out_err_unlock; - } - if (reason_code > 0) - goto decline_rdma_unlock; + if (reason_code) + return smc_connect_abort(smc, reason_code, + local_contact); } - mutex_unlock(&smc_create_lgr_pending); - smc_tx_init(smc); -out_connected: smc_copy_sock_settings_to_clc(smc); if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; - return rc ? 
rc : local_contact; + return 0; +} -decline_rdma_unlock: - if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(smc->conn.lgr); - mutex_unlock(&smc_create_lgr_pending); - smc_conn_free(&smc->conn); -decline_rdma: - /* RDMA setup failed, switch back to TCP */ - smc->use_fallback = true; - if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(smc, reason_code); - if (rc < 0) - goto out_err; - } - goto out_connected; +/* perform steps before actually connecting */ +static int __smc_connect(struct smc_sock *smc) +{ + struct smc_clc_msg_accept_confirm aclc; + struct smc_ib_device *ibdev; + int rc = 0; + u8 ibport; -out_err_unlock: - if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(smc->conn.lgr); - mutex_unlock(&smc_create_lgr_pending); - smc_conn_free(&smc->conn); -out_err: - if (smc->sk.sk_state == SMC_INIT) - sock_put(&smc->sk); /* passive closing */ - return rc; + sock_hold(&smc->sk); /* sock put in passive closing */ + + if (smc->use_fallback) + return smc_connect_fallback(smc); + + /* if peer has not signalled SMC-capability, fall back */ + if (!tcp_sk(smc->clcsock->sk)->syn_smc) + return smc_connect_fallback(smc); + + /* IPSec connections opt out of SMC-R optimizations */ + if (using_ipsec(smc)) + return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC); + + /* check if a RDMA device is available; if not, fall back */ + if (smc_check_rdma(smc, &ibdev, &ibport)) + return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); + + /* perform CLC handshake */ + rc = smc_connect_clc(smc, &aclc, ibdev, ibport); + if (rc) + return smc_connect_decline_fallback(smc, rc); + + /* connect using rdma */ + rc = smc_connect_rdma(smc, &aclc, ibdev, ibport); + if (rc) + return smc_connect_decline_fallback(smc, rc); + + return 0; } static int smc_connect(struct socket *sock, struct sockaddr *addr, @@ -590,8 +609,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, if (rc) goto out; - /* setup RDMA connection */ - rc = smc_connect_rdma(smc); + rc = __smc_connect(smc); if (rc < 0) goto out; else @@ -789,182 +807,239 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) return 0; } -/* setup for RDMA connection of server */ -static void smc_listen_work(struct work_struct *work) +/* listen worker: finish */ +static void smc_listen_out(struct smc_sock *new_smc) { - struct smc_sock *new_smc = container_of(work, struct smc_sock, - smc_listen_work); - struct smc_clc_msg_proposal_prefix *pclc_prfx; - struct socket *newclcsock = new_smc->clcsock; struct smc_sock *lsmc = new_smc->listen_smc; - struct smc_clc_msg_accept_confirm cclc; - int local_contact = SMC_REUSE_CONTACT; struct sock *newsmcsk = &new_smc->sk; - struct smc_clc_msg_proposal *pclc; - struct smc_ib_device *smcibdev; - u8 buf[SMC_CLC_MAX_LEN]; - struct smc_link *link; - int reason_code = 0; - int rc = 0; - u8 ibport; - if (new_smc->use_fallback) - goto out_connected; - - /* check if peer is smc capable */ - if (!tcp_sk(newclcsock->sk)->syn_smc) { - new_smc->use_fallback = true; - goto out_connected; + lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); + if (lsmc->sk.sk_state == SMC_LISTEN) { + smc_accept_enqueue(&lsmc->sk, newsmcsk); + } else { /* no longer listening */ + smc_close_non_accepted(newsmcsk); } + release_sock(&lsmc->sk); - /* do inband token exchange - - *wait for and receive SMC Proposal CLC message - */ - reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf), - SMC_CLC_PROPOSAL); - if (reason_code < 0) - goto out_err; - if (reason_code > 0) - goto decline_rdma; 
+ /* Wake up accept */ + lsmc->sk.sk_data_ready(&lsmc->sk); + sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */ +} - /* IPSec connections opt out of SMC-R optimizations */ - if (using_ipsec(new_smc)) { - reason_code = SMC_CLC_DECL_IPSEC; - goto decline_rdma; - } +/* listen worker: finish in state connected */ +static void smc_listen_out_connected(struct smc_sock *new_smc) +{ + struct sock *newsmcsk = &new_smc->sk; - /* PNET table look up: search active ib_device and port - * within same PNETID that also contains the ethernet device - * used for the internal TCP socket - */ - smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport); - if (!smcibdev) { - reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - goto decline_rdma; + sk_refcnt_debug_inc(newsmcsk); + if (newsmcsk->sk_state == SMC_INIT) + newsmcsk->sk_state = SMC_ACTIVE; + + smc_listen_out(new_smc); +} + +/* listen worker: finish in error state */ +static void smc_listen_out_err(struct smc_sock *new_smc) +{ + struct sock *newsmcsk = &new_smc->sk; + + if (newsmcsk->sk_state == SMC_INIT) + sock_put(&new_smc->sk); /* passive closing */ + newsmcsk->sk_state = SMC_CLOSED; + smc_conn_free(&new_smc->conn); + + smc_listen_out(new_smc); +} + +/* listen worker: decline and fall back if possible */ +static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, + int local_contact) +{ + /* RDMA setup failed, switch back to TCP */ + if (local_contact == SMC_FIRST_CONTACT) + smc_lgr_forget(new_smc->conn.lgr); + if (reason_code < 0) { /* error, no fallback possible */ + smc_listen_out_err(new_smc); + return; + } + smc_conn_free(&new_smc->conn); + new_smc->use_fallback = true; + if (reason_code && reason_code != SMC_CLC_DECL_REPLY) { + if (smc_clc_send_decline(new_smc, reason_code) < 0) { + smc_listen_out_err(new_smc); + return; + } } + smc_listen_out_connected(new_smc); +} + +/* listen worker: check prefixes */ +static int smc_listen_rdma_check(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc) +{ + struct smc_clc_msg_proposal_prefix *pclc_prfx; + struct socket *newclcsock = new_smc->clcsock; - pclc = (struct smc_clc_msg_proposal *)&buf; pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (smc_clc_prfx_match(newclcsock, pclc_prfx)) + return SMC_CLC_DECL_CNFERR; - rc = smc_clc_prfx_match(newclcsock, pclc_prfx); - if (rc) { - reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - goto decline_rdma; - } + return 0; +} +/* listen worker: initialize connection and buffers */ +static int smc_listen_rdma_init(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_ib_device *ibdev, u8 ibport, + int *local_contact) +{ /* allocate connection / link group */ - mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl, - 0); - if (local_contact < 0) { - rc = local_contact; - if (rc == -ENOMEM) - reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - goto decline_rdma_unlock; + *local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0); + if (*local_contact < 0) { + if (*local_contact == -ENOMEM) + return SMC_CLC_DECL_MEM;/* insufficient memory*/ + return SMC_CLC_DECL_INTERR; /* other error */ } - link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; /* create send buffer and rmb */ - rc = smc_buf_create(new_smc); - if (rc) { - reason_code = SMC_CLC_DECL_MEM; - goto decline_rdma_unlock; - } + if (smc_buf_create(new_smc)) + return SMC_CLC_DECL_MEM; - smc_close_init(new_smc); - smc_rx_init(new_smc); + return 0; +} + +/* 
listen worker: register buffers */ +static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) +{ + struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; if (local_contact != SMC_FIRST_CONTACT) { if (!new_smc->conn.rmb_desc->reused) { - if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) { - reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; - } + if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) + return SMC_CLC_DECL_INTERR; } } smc_rmb_sync_sg_for_device(&new_smc->conn); - rc = smc_clc_send_accept(new_smc, local_contact); - if (rc) - goto out_err_unlock; + return 0; +} + +/* listen worker: finish RDMA setup */ +static void smc_listen_rdma_finish(struct smc_sock *new_smc, + struct smc_clc_msg_accept_confirm *cclc, + int local_contact) +{ + struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + int reason_code = 0; - /* receive SMC Confirm CLC message */ - reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), - SMC_CLC_CONFIRM); - if (reason_code < 0) - goto out_err_unlock; - if (reason_code > 0) - goto decline_rdma_unlock; - smc_conn_save_peer_info(new_smc, &cclc); if (local_contact == SMC_FIRST_CONTACT) - smc_link_save_peer_info(link, &cclc); + smc_link_save_peer_info(link, cclc); - rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc); - if (rc) { + if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) { reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; + goto decline; } if (local_contact == SMC_FIRST_CONTACT) { - rc = smc_ib_ready_link(link); - if (rc) { + if (smc_ib_ready_link(link)) { reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; + goto decline; } /* QP confirmation over RoCE fabric */ reason_code = smc_serv_conf_first_link(new_smc); - if (reason_code < 0) - /* peer is not aware of a problem */ - goto out_err_unlock; - if (reason_code > 0) - goto decline_rdma_unlock; + if (reason_code) + goto decline; } + return; - smc_tx_init(new_smc); +decline: mutex_unlock(&smc_create_lgr_pending); + smc_listen_decline(new_smc, reason_code, local_contact); +} -out_connected: - sk_refcnt_debug_inc(newsmcsk); - if (newsmcsk->sk_state == SMC_INIT) - newsmcsk->sk_state = SMC_ACTIVE; -enqueue: - lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); - if (lsmc->sk.sk_state == SMC_LISTEN) { - smc_accept_enqueue(&lsmc->sk, newsmcsk); - } else { /* no longer listening */ - smc_close_non_accepted(newsmcsk); +/* setup for RDMA connection of server */ +static void smc_listen_work(struct work_struct *work) +{ + struct smc_sock *new_smc = container_of(work, struct smc_sock, + smc_listen_work); + struct socket *newclcsock = new_smc->clcsock; + struct smc_clc_msg_accept_confirm cclc; + struct smc_clc_msg_proposal *pclc; + struct smc_ib_device *ibdev; + u8 buf[SMC_CLC_MAX_LEN]; + int local_contact = 0; + int reason_code = 0; + int rc = 0; + u8 ibport; + + if (new_smc->use_fallback) { + smc_listen_out_connected(new_smc); + return; } - release_sock(&lsmc->sk); - /* Wake up accept */ - lsmc->sk.sk_data_ready(&lsmc->sk); - sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */ - return; + /* check if peer is smc capable */ + if (!tcp_sk(newclcsock->sk)->syn_smc) { + new_smc->use_fallback = true; + smc_listen_out_connected(new_smc); + return; + } -decline_rdma_unlock: - if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(new_smc->conn.lgr); - mutex_unlock(&smc_create_lgr_pending); -decline_rdma: - /* RDMA setup failed, switch back to TCP */ - smc_conn_free(&new_smc->conn); - new_smc->use_fallback = true; - if (reason_code 
&& (reason_code != SMC_CLC_DECL_REPLY)) { - if (smc_clc_send_decline(new_smc, reason_code) < 0) - goto out_err; + /* do inband token exchange - + * wait for and receive SMC Proposal CLC message + */ + pclc = (struct smc_clc_msg_proposal *)&buf; + reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, + SMC_CLC_PROPOSAL); + if (reason_code) { + smc_listen_decline(new_smc, reason_code, 0); + return; } - goto out_connected; -out_err_unlock: - if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(new_smc->conn.lgr); + /* IPSec connections opt out of SMC-R optimizations */ + if (using_ipsec(new_smc)) { + smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0); + return; + } + + mutex_lock(&smc_create_lgr_pending); + smc_close_init(new_smc); + smc_rx_init(new_smc); + smc_tx_init(new_smc); + + /* check if RDMA is available */ + if (smc_check_rdma(new_smc, &ibdev, &ibport) || + smc_listen_rdma_check(new_smc, pclc) || + smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, + &local_contact) || + smc_listen_rdma_reg(new_smc, local_contact)) { + /* SMC not supported, decline */ + mutex_unlock(&smc_create_lgr_pending); + smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact); + return; + } + + /* send SMC Accept CLC message */ + rc = smc_clc_send_accept(new_smc, local_contact); + if (rc) { + mutex_unlock(&smc_create_lgr_pending); + smc_listen_decline(new_smc, rc, local_contact); + return; + } + + /* receive SMC Confirm CLC message */ + reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), + SMC_CLC_CONFIRM); + if (reason_code) { + mutex_unlock(&smc_create_lgr_pending); + smc_listen_decline(new_smc, reason_code, local_contact); + return; + } + + /* finish worker */ + smc_listen_rdma_finish(new_smc, &cclc, local_contact); + smc_conn_save_peer_info(new_smc, &cclc); mutex_unlock(&smc_create_lgr_pending); -out_err: - if (newsmcsk->sk_state == SMC_INIT) - sock_put(&new_smc->sk); /* passive closing */ - newsmcsk->sk_state = SMC_CLOSED; - smc_conn_free(&new_smc->conn); - goto enqueue; /* queue new sock with sk_err set */ + smc_listen_out_connected(new_smc); } static void smc_tcp_listen_work(struct work_struct *work) @@ -1225,7 +1300,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, if (sk->sk_state == SMC_INIT && mask & EPOLLOUT && smc->clcsock->sk->sk_state != TCP_CLOSE) { - rc = smc_connect_rdma(smc); + rc = __smc_connect(smc); if (rc < 0) mask |= EPOLLERR; /* success cases including fallback */ @@ -1261,6 +1336,8 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, if (sk->sk_state == SMC_APPCLOSEWAIT1) mask |= EPOLLIN; } + if (smc->conn.urg_state == SMC_URG_VALID) + mask |= EPOLLPRI; } release_sock(sk); @@ -1400,10 +1477,13 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, static int smc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + union smc_host_cursor cons, urg; + struct smc_connection *conn; struct smc_sock *smc; int answ; smc = smc_sk(sock->sk); + conn = &smc->conn; if (smc->use_fallback) { if (!smc->clcsock) return -EBADF; @@ -1413,20 +1493,49 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd, case SIOCINQ: /* same as FIONREAD */ if (smc->sk.sk_state == SMC_LISTEN) return -EINVAL; - answ = atomic_read(&smc->conn.bytes_to_rcv); + if (smc->sk.sk_state == SMC_INIT || + smc->sk.sk_state == SMC_CLOSED) + answ = 0; + else + answ = atomic_read(&smc->conn.bytes_to_rcv); break; case SIOCOUTQ: /* output queue size (not send + not acked) */ if (smc->sk.sk_state == SMC_LISTEN) return -EINVAL; - answ = 
smc->conn.sndbuf_size - + if (smc->sk.sk_state == SMC_INIT || + smc->sk.sk_state == SMC_CLOSED) + answ = 0; + else + answ = smc->conn.sndbuf_desc->len - atomic_read(&smc->conn.sndbuf_space); break; case SIOCOUTQNSD: /* output queue size (not send only) */ if (smc->sk.sk_state == SMC_LISTEN) return -EINVAL; - answ = smc_tx_prepared_sends(&smc->conn); + if (smc->sk.sk_state == SMC_INIT || + smc->sk.sk_state == SMC_CLOSED) + answ = 0; + else + answ = smc_tx_prepared_sends(&smc->conn); + break; + case SIOCATMARK: + if (smc->sk.sk_state == SMC_LISTEN) + return -EINVAL; + if (smc->sk.sk_state == SMC_INIT || + smc->sk.sk_state == SMC_CLOSED) { + answ = 0; + } else { + smc_curs_write(&cons, + smc_curs_read(&conn->local_tx_ctrl.cons, conn), + conn); + smc_curs_write(&urg, + smc_curs_read(&conn->urg_curs, conn), + conn); + answ = smc_curs_diff(conn->rmb_desc->len, + &cons, &urg) == 1; + } break; default: return -ENOIOCTLCMD; @@ -1635,19 +1744,7 @@ out_pnet: static void __exit smc_exit(void) { - struct smc_link_group *lgr, *lg; - LIST_HEAD(lgr_freeing_list); - - spin_lock_bh(&smc_lgr_list.lock); - if (!list_empty(&smc_lgr_list.list)) - list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); - spin_unlock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { - list_del_init(&lgr->list); - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); - cancel_delayed_work_sync(&lgr->free_work); - smc_lgr_free(lgr); /* free link group */ - } + smc_core_exit(); static_branch_disable(&tcp_have_smc); smc_ib_unregister_client(); sock_unregister(PF_SMC); diff --git a/net/smc/smc.h b/net/smc/smc.h index ec209cd48d42..51ae1f10d81a 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -114,11 +114,17 @@ struct smc_host_cdc_msg { /* Connection Data Control message */ u8 reserved[18]; } __aligned(8); +enum smc_urg_state { + SMC_URG_VALID, /* data present */ + SMC_URG_NOTYET, /* data pending */ + SMC_URG_READ /* data was already read */ +}; + struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ u32 alert_token_local; /* unique conn. id */ - u8 peer_conn_idx; /* from tcp handshake */ + u8 peer_rmbe_idx; /* from tcp handshake */ int peer_rmbe_size; /* size of peer rx buffer */ atomic_t peer_rmbe_space;/* remaining free bytes in peer * rmbe @@ -126,9 +132,7 @@ struct smc_connection { int rtoken_idx; /* idx to peer RMB rkey/addr */ struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ - int sndbuf_size; /* sndbuf size <== sock wmem */ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ - int rmbe_size; /* RMBE size <== sock rmem */ int rmbe_size_short;/* compressed notation */ int rmbe_update_limit; /* lower limit for consumer @@ -153,6 +157,7 @@ struct smc_connection { u16 tx_cdc_seq; /* sequence # for CDC send */ spinlock_t send_lock; /* protect wr_sends */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ + u32 tx_off; /* base offset in peer rmb */ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. * .prod cf. TCP rcv_nxt @@ -161,6 +166,15 @@ struct smc_connection { union smc_host_cursor rx_curs_confirmed; /* confirmed to peer * source of snd_una ? 
*/ + union smc_host_cursor urg_curs; /* points at urgent byte */ + enum smc_urg_state urg_state; + bool urg_tx_pend; /* urgent data staged */ + bool urg_rx_skip_pend; + /* indicate urgent oob data + * read, but previous regular + * data still pending + */ + char urg_rx_byte; /* urgent byte */ atomic_t bytes_to_rcv; /* arrived data, * not yet received */ @@ -221,41 +235,6 @@ static inline u32 ntoh24(u8 *net) return be32_to_cpu(t); } -#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ - -#define SMC_RMBE_SIZES 16 /* number of distinct sizes for an RMBE */ -/* theoretically, the RFC states that largest size would be 512K, - * i.e. compressed 5 and thus 6 sizes (0..5), despite - * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) - */ - -/* convert the RMB size into the compressed notation - minimum 16K. - * In contrast to plain ilog2, this rounds towards the next power of 2, - * so the socket application gets at least its desired sndbuf / rcvbuf size. - */ -static inline u8 smc_compress_bufsize(int size) -{ - u8 compressed; - - if (size <= SMC_BUF_MIN_SIZE) - return 0; - - size = (size - 1) >> 14; - compressed = ilog2(size) + 1; - if (compressed >= SMC_RMBE_SIZES) - compressed = SMC_RMBE_SIZES - 1; - return compressed; -} - -/* convert the RMB size from compressed notation into integer */ -static inline int smc_uncompress_bufsize(u8 compressed) -{ - u32 size; - - size = 0x00000001 << (((int)compressed) + 14); - return (int)size; -} - #ifdef CONFIG_XFRM static inline bool using_ipsec(struct smc_sock *smc) { @@ -269,12 +248,6 @@ static inline bool using_ipsec(struct smc_sock *smc) } #endif -struct smc_clc_msg_local; - -void smc_conn_free(struct smc_connection *conn); -int smc_conn_create(struct smc_sock *smc, - struct smc_ib_device *smcibdev, u8 ibport, - struct smc_clc_msg_local *lcl, int srv_first_contact); struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock); void smc_close_non_accepted(struct sock *sk); diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 42ad57365eca..a7e8d63fc8ae 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -44,13 +44,13 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, smc = container_of(cdcpend->conn, struct smc_sock, conn); bh_lock_sock(&smc->sk); if (!wc_status) { - diff = smc_curs_diff(cdcpend->conn->sndbuf_size, + diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len, &cdcpend->conn->tx_curs_fin, &cdcpend->cursor); /* sndbuf_space is decreased in smc_sendmsg */ smp_mb__before_atomic(); atomic_add(diff, &cdcpend->conn->sndbuf_space); - /* guarantee 0 <= sndbuf_space <= sndbuf_size */ + /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); smc_curs_write(&cdcpend->conn->tx_curs_fin, smc_curs_read(&cdcpend->cursor, cdcpend->conn), @@ -164,20 +164,35 @@ static inline bool smc_cdc_before(u16 seq1, u16 seq2) return (s16)(seq1 - seq2) < 0; } +static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc, + int *diff_prod) +{ + struct smc_connection *conn = &smc->conn; + char *base; + + /* new data included urgent business */ + smc_curs_write(&conn->urg_curs, + smc_curs_read(&conn->local_rx_ctrl.prod, conn), + conn); + conn->urg_state = SMC_URG_VALID; + if (!sock_flag(&smc->sk, SOCK_URGINLINE)) + /* we'll skip the urgent byte, so don't account for it */ + (*diff_prod)--; + base = (char *)conn->rmb_desc->cpu_addr; + if (conn->urg_curs.count) + conn->urg_rx_byte = *(base + conn->urg_curs.count - 1); + else + conn->urg_rx_byte = *(base + conn->rmb_desc->len - 
1); + sk_send_sigurg(&smc->sk); +} + static void smc_cdc_msg_recv_action(struct smc_sock *smc, - struct smc_link *link, struct smc_cdc_msg *cdc) { union smc_host_cursor cons_old, prod_old; struct smc_connection *conn = &smc->conn; int diff_cons, diff_prod; - if (!cdc->prod_flags.failover_validation) { - if (smc_cdc_before(ntohs(cdc->seqno), - conn->local_rx_ctrl.seqno)) - /* received seqno is old */ - return; - } smc_curs_write(&prod_old, smc_curs_read(&conn->local_rx_ctrl.prod, conn), conn); @@ -198,18 +213,28 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, smp_mb__after_atomic(); } - diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old, + diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old, &conn->local_rx_ctrl.prod); if (diff_prod) { + if (conn->local_rx_ctrl.prod_flags.urg_data_present) + smc_cdc_handle_urg_data_arrival(smc, &diff_prod); /* bytes_to_rcv is decreased in smc_recvmsg */ smp_mb__before_atomic(); atomic_add(diff_prod, &conn->bytes_to_rcv); - /* guarantee 0 <= bytes_to_rcv <= rmbe_size */ + /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */ smp_mb__after_atomic(); smc->sk.sk_data_ready(&smc->sk); - } else if ((conn->local_rx_ctrl.prod_flags.write_blocked) || - (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req)) { - smc->sk.sk_data_ready(&smc->sk); + } else { + if (conn->local_rx_ctrl.prod_flags.write_blocked || + conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || + conn->local_rx_ctrl.prod_flags.urg_data_pending) { + if (conn->local_rx_ctrl.prod_flags.urg_data_pending) + conn->urg_state = SMC_URG_NOTYET; + /* force immediate tx of current consumer cursor, but + * under send_lock to guarantee arrival in seqno-order + */ + smc_tx_sndbuf_nonempty(conn); + } } /* piggy backed tx info */ @@ -219,6 +244,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, /* trigger socket release if connection closed */ smc_close_wake_tx_prepared(smc); } + if (diff_cons && conn->urg_tx_pend && + atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) { + /* urg data confirmed by peer, indicate we're ready for more */ + conn->urg_tx_pend = false; + smc->sk.sk_write_space(&smc->sk); + } if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { smc->sk.sk_err = ECONNRESET; @@ -236,26 +267,11 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, } /* called under tasklet context */ -static inline void smc_cdc_msg_recv(struct smc_cdc_msg *cdc, - struct smc_link *link, u64 wr_id) +static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc) { - struct smc_link_group *lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); - struct smc_connection *connection; - struct smc_sock *smc; - - /* lookup connection */ - read_lock_bh(&lgr->conns_lock); - connection = smc_lgr_find_conn(ntohl(cdc->token), lgr); - if (!connection) { - read_unlock_bh(&lgr->conns_lock); - return; - } - smc = container_of(connection, struct smc_sock, conn); sock_hold(&smc->sk); - read_unlock_bh(&lgr->conns_lock); bh_lock_sock(&smc->sk); - smc_cdc_msg_recv_action(smc, link, cdc); + smc_cdc_msg_recv_action(smc, cdc); bh_unlock_sock(&smc->sk); sock_put(&smc->sk); /* no free sk in softirq-context */ } @@ -266,12 +282,31 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) { struct smc_link *link = (struct smc_link *)wc->qp->qp_context; struct smc_cdc_msg *cdc = buf; + struct smc_connection *conn; + struct smc_link_group *lgr; + struct smc_sock *smc; if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved)) return; /* short message */ if (cdc->len != 
SMC_WR_TX_SIZE) return; /* invalid message */ - smc_cdc_msg_recv(cdc, link, wc->wr_id); + + /* lookup connection */ + lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + read_lock_bh(&lgr->conns_lock); + conn = smc_lgr_find_conn(ntohl(cdc->token), lgr); + read_unlock_bh(&lgr->conns_lock); + if (!conn) + return; + smc = container_of(conn, struct smc_sock, conn); + + if (!cdc->prod_flags.failover_validation) { + if (smc_cdc_before(ntohs(cdc->seqno), + conn->local_rx_ctrl.seqno)) + /* received seqno is old */ + return; + } + smc_cdc_msg_recv(smc, cdc); } static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = { diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index d2012fd22100..f60082fee5b8 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -146,6 +146,19 @@ static inline int smc_curs_diff(unsigned int size, return max_t(int, 0, (new->count - old->count)); } +/* calculate cursor difference between old and new - returns negative + * value in case old > new + */ +static inline int smc_curs_comp(unsigned int size, + union smc_host_cursor *old, + union smc_host_cursor *new) +{ + if (old->wrap > new->wrap || + (old->wrap == new->wrap && old->count > new->count)) + return -smc_curs_diff(size, new, old); + return smc_curs_diff(size, old, new); +} + static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer, union smc_host_cursor *local, struct smc_connection *conn) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 236cb3f12c71..717449b1da0b 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -442,7 +442,7 @@ int smc_clc_send_confirm(struct smc_sock *smc) hton24(cclc.qpn, link->roce_qp->qp_num); cclc.rmb_rkey = htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); - cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */ + cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ cclc.rmbe_alert_token = htonl(conn->alert_token_local); cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); cclc.rmbe_size = conn->rmbe_size_short; @@ -494,7 +494,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) hton24(aclc.qpn, link->roce_qp->qp_num); aclc.rmb_rkey = htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); - aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */ + aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ aclc.rmbe_alert_token = htonl(conn->alert_token_local); aclc.qp_mtu = link->path_mtu; aclc.rmbe_size = conn->rmbe_size_short, diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 63bf1dc2c1f9..41ff9ea96139 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -97,7 +97,7 @@ struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ struct smc_clc_msg_local lcl; u8 qpn[3]; /* QP number */ __be32 rmb_rkey; /* RMB rkey */ - u8 conn_idx; /* Connection index, which RMBE in RMB */ + u8 rmbe_idx; /* Index of RMBE in RMB */ __be32 rmbe_alert_token;/* unique connection id */ #if defined(__BIG_ENDIAN_BITFIELD) u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 08c05cd0bbae..add82b0266f3 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -28,12 +28,16 @@ #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) -#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10) +#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) -static u32 smc_lgr_num; /* unique link group number */ +static struct smc_lgr_list smc_lgr_list = { /* established link groups */ + .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), + .list = 
LIST_HEAD_INIT(smc_lgr_list.list), + .num = 0, +}; -static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk, - bool is_rmb); +static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc); static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) { @@ -181,8 +185,8 @@ static int smc_lgr_create(struct smc_sock *smc, INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); } - smc_lgr_num += SMC_LGR_NUM_INCR; - memcpy(&lgr->id, (u8 *)&smc_lgr_num, SMC_LGR_ID_SIZE); + smc_lgr_list.num += SMC_LGR_NUM_INCR; + memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); lgr->conns_all = RB_ROOT; @@ -236,26 +240,21 @@ out: static void smc_buf_unuse(struct smc_connection *conn) { - if (conn->sndbuf_desc) { + if (conn->sndbuf_desc) conn->sndbuf_desc->used = 0; - conn->sndbuf_size = 0; - } if (conn->rmb_desc) { if (!conn->rmb_desc->regerr) { conn->rmb_desc->reused = 1; conn->rmb_desc->used = 0; - conn->rmbe_size = 0; } else { /* buf registration failed, reuse not possible */ struct smc_link_group *lgr = conn->lgr; - struct smc_link *lnk; write_lock_bh(&lgr->rmbs_lock); list_del(&conn->rmb_desc->list); write_unlock_bh(&lgr->rmbs_lock); - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - smc_buf_free(conn->rmb_desc, lnk, true); + smc_buf_free(lgr, true, conn->rmb_desc); } } } @@ -281,9 +280,11 @@ static void smc_link_clear(struct smc_link *lnk) smc_wr_free_link_mem(lnk); } -static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk, - bool is_rmb) +static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc) { + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + if (is_rmb) { if (buf_desc->mr_rx[SMC_SINGLE_LINK]) smc_ib_put_memory_region( @@ -302,7 +303,6 @@ static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk, static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; struct smc_buf_desc *buf_desc, *bf_desc; struct list_head *buf_list; int i; @@ -315,7 +315,7 @@ static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) list_for_each_entry_safe(buf_desc, bf_desc, buf_list, list) { list_del(&buf_desc->list); - smc_buf_free(buf_desc, lnk, is_rmb); + smc_buf_free(lgr, is_rmb, buf_desc); } } } @@ -346,7 +346,7 @@ void smc_lgr_forget(struct smc_link_group *lgr) } /* terminate linkgroup abnormally */ -void smc_lgr_terminate(struct smc_link_group *lgr) +static void __smc_lgr_terminate(struct smc_link_group *lgr) { struct smc_connection *conn; struct smc_sock *smc; @@ -355,7 +355,8 @@ void smc_lgr_terminate(struct smc_link_group *lgr) if (lgr->terminating) return; /* lgr already terminating */ lgr->terminating = 1; - smc_lgr_forget(lgr); + if (!list_empty(&lgr->list)) /* forget lgr */ + list_del_init(&lgr->list); smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); write_lock_bh(&lgr->conns_lock); @@ -377,6 +378,27 @@ void smc_lgr_terminate(struct smc_link_group *lgr) smc_lgr_schedule_free_work(lgr); } +void smc_lgr_terminate(struct smc_link_group *lgr) +{ + spin_lock_bh(&smc_lgr_list.lock); + __smc_lgr_terminate(lgr); + spin_unlock_bh(&smc_lgr_list.lock); +} + +/* Called when IB port is terminated */ +void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr, *l; + + spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { + if 
(lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && + lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) + __smc_lgr_terminate(lgr); + } + spin_unlock_bh(&smc_lgr_list.lock); +} + /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ @@ -461,10 +483,10 @@ int smc_conn_create(struct smc_sock *smc, struct smc_clc_msg_local *lcl, int srv_first_contact) { struct smc_connection *conn = &smc->conn; + int local_contact = SMC_FIRST_CONTACT; struct smc_link_group *lgr; unsigned short vlan_id; enum smc_lgr_role role; - int local_contact = SMC_FIRST_CONTACT; int rc = 0; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; @@ -522,6 +544,7 @@ create: } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; + conn->urg_state = SMC_URG_READ; #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&conn->acurs_lock); #endif @@ -530,14 +553,39 @@ out: return rc ? rc : local_contact; } +/* convert the RMB size into the compressed notation - minimum 16K. + * In contrast to plain ilog2, this rounds towards the next power of 2, + * so the socket application gets at least its desired sndbuf / rcvbuf size. + */ +static u8 smc_compress_bufsize(int size) +{ + u8 compressed; + + if (size <= SMC_BUF_MIN_SIZE) + return 0; + + size = (size - 1) >> 14; + compressed = ilog2(size) + 1; + if (compressed >= SMC_RMBE_SIZES) + compressed = SMC_RMBE_SIZES - 1; + return compressed; +} + +/* convert the RMB size from compressed notation into integer */ +int smc_uncompress_bufsize(u8 compressed) +{ + u32 size; + + size = 0x00000001 << (((int)compressed) + 14); + return (int)size; +} + /* try to reuse a sndbuf or rmb description slot for a certain * buffer size; if not available, return NULL */ -static inline -struct smc_buf_desc *smc_buf_get_slot(struct smc_link_group *lgr, - int compressed_bufsize, - rwlock_t *lock, - struct list_head *buf_list) +static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, + rwlock_t *lock, + struct list_head *buf_list) { struct smc_buf_desc *buf_slot; @@ -589,7 +637,7 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1, GFP_KERNEL); if (rc) { - smc_buf_free(buf_desc, lnk, is_rmb); + smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(rc); } sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl, @@ -600,7 +648,7 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, is_rmb ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ if (rc != 1) { - smc_buf_free(buf_desc, lnk, is_rmb); + smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(-EAGAIN); } @@ -611,19 +659,20 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, IB_ACCESS_LOCAL_WRITE, buf_desc); if (rc) { - smc_buf_free(buf_desc, lnk, is_rmb); + smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(rc); } } + buf_desc->len = bufsize; return buf_desc; } static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) { + struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; - struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct list_head *buf_list; int bufsize, bufsize_short; int sk_buf_size; @@ -651,7 +700,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) continue; /* check for reusable slot in the link group */ - buf_desc = smc_buf_get_slot(lgr, bufsize_short, lock, buf_list); + buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); if (buf_desc) { memset(buf_desc->cpu_addr, 0, bufsize); break; /* found reusable slot */ @@ -675,14 +724,12 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) if (is_rmb) { conn->rmb_desc = buf_desc; - conn->rmbe_size = bufsize; conn->rmbe_size_short = bufsize_short; smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize); } else { conn->sndbuf_desc = buf_desc; - conn->sndbuf_size = bufsize; smc->sk.sk_sndbuf = bufsize * 2; atomic_set(&conn->sndbuf_space, bufsize); } @@ -738,8 +785,7 @@ int smc_buf_create(struct smc_sock *smc) /* create rmb */ rc = __smc_buf_create(smc, true); if (rc) - smc_buf_free(smc->conn.sndbuf_desc, - &smc->conn.lgr->lnk[SMC_SINGLE_LINK], false); + smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); return rc; } @@ -806,3 +852,21 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn, return conn->rtoken_idx; return 0; } + +/* Called (from smc_exit) when module is removed */ +void smc_core_exit(void) +{ + struct smc_link_group *lgr, *lg; + LIST_HEAD(lgr_freeing_list); + + spin_lock_bh(&smc_lgr_list.lock); + if (!list_empty(&smc_lgr_list.list)) + list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); + spin_unlock_bh(&smc_lgr_list.lock); + list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { + list_del_init(&lgr->list); + smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + cancel_delayed_work_sync(&lgr->free_work); + smc_lgr_free(lgr); /* free link group */ + } +} diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 845dc073de13..93cb3523bf50 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -23,10 +23,9 @@ struct smc_lgr_list { /* list of link group definition */ struct list_head list; spinlock_t lock; /* protects list of link groups */ + u32 num; /* unique link group number */ }; -extern struct smc_lgr_list smc_lgr_list; /* list of link groups */ - enum smc_lgr_role { /* possible roles of a link group */ SMC_CLNT, /* client */ SMC_SERV /* server */ @@ -124,6 +123,7 @@ struct smc_buf_desc { struct list_head list; void *cpu_addr; /* virtual address of buffer */ struct page *pages; + int len; /* length of buffer */ struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */ struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; /* for rmb only: memory region @@ -141,6 +141,12 @@ struct smc_rtoken { /* address/key of remote RMB */ }; #define SMC_LGR_ID_SIZE 4 +#define 
SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ +#define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */ +/* theoretically, the RFC states that largest size would be 512K, + * i.e. compressed 5 and thus 6 sizes (0..5), despite + * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) + */ struct smc_link_group { struct list_head list; @@ -205,11 +211,14 @@ static inline struct smc_connection *smc_lgr_find_conn( struct smc_sock; struct smc_clc_msg_accept_confirm; +struct smc_clc_msg_local; void smc_lgr_free(struct smc_link_group *lgr); void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_terminate(struct smc_link_group *lgr); +void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); int smc_buf_create(struct smc_sock *smc); +int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc); int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey); @@ -218,4 +227,9 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); void smc_rmb_sync_sg_for_device(struct smc_connection *conn); +void smc_conn_free(struct smc_connection *conn); +int smc_conn_create(struct smc_sock *smc, + struct smc_ib_device *smcibdev, u8 ibport, + struct smc_clc_msg_local *lcl, int srv_first_contact); +void smc_core_exit(void); #endif diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 05dd7e6d314d..839354402215 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -101,8 +101,9 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, struct smc_connection *conn = &smc->conn; struct smc_diag_conninfo cinfo = { .token = conn->alert_token_local, - .sndbuf_size = conn->sndbuf_size, - .rmbe_size = conn->rmbe_size, + .sndbuf_size = conn->sndbuf_desc ? + conn->sndbuf_desc->len : 0, + .rmbe_size = conn->rmb_desc ? 
conn->rmb_desc->len : 0, .peer_rmbe_size = conn->peer_rmbe_size, .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap, diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 26df554f7588..0eed7ab9f28b 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -143,17 +143,6 @@ out: return rc; } -static void smc_ib_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) -{ - struct smc_link_group *lgr, *l; - - list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && - lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) - smc_lgr_terminate(lgr); - } -} - /* process context wrapper for might_sleep smc_ib_remember_port_attr */ static void smc_ib_port_event_work(struct work_struct *work) { @@ -165,7 +154,7 @@ static void smc_ib_port_event_work(struct work_struct *work) smc_ib_remember_port_attr(smcibdev, port_idx + 1); clear_bit(port_idx, &smcibdev->port_event_mask); if (!smc_ib_port_active(smcibdev, port_idx + 1)) - smc_ib_port_terminate(smcibdev, port_idx + 1); + smc_port_terminate(smcibdev, port_idx + 1); } } diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 74568cdbca70..d7b88b2d1b22 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -245,40 +245,45 @@ out: static int smc_pnet_fill_entry(struct net *net, struct smc_pnetentry *pnetelem, struct nlattr *tb[]) { - char *string, *ibname = NULL; - int rc = 0; + char *string, *ibname; + int rc; memset(pnetelem, 0, sizeof(*pnetelem)); INIT_LIST_HEAD(&pnetelem->list); - if (tb[SMC_PNETID_NAME]) { - string = (char *)nla_data(tb[SMC_PNETID_NAME]); - if (!smc_pnetid_valid(string, pnetelem->pnet_name)) { - rc = -EINVAL; - goto error; - } - } - if (tb[SMC_PNETID_ETHNAME]) { - string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); - pnetelem->ndev = dev_get_by_name(net, string); - if (!pnetelem->ndev) - return -ENOENT; - } - if (tb[SMC_PNETID_IBNAME]) { - ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]); - ibname = strim(ibname); - pnetelem->smcibdev = smc_pnet_find_ib(ibname); - if (!pnetelem->smcibdev) { - rc = -ENOENT; - goto error; - } - } - if (tb[SMC_PNETID_IBPORT]) { - pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]); - if (pnetelem->ib_port > SMC_MAX_PORTS) { - rc = -EINVAL; - goto error; - } - } + + rc = -EINVAL; + if (!tb[SMC_PNETID_NAME]) + goto error; + string = (char *)nla_data(tb[SMC_PNETID_NAME]); + if (!smc_pnetid_valid(string, pnetelem->pnet_name)) + goto error; + + rc = -EINVAL; + if (!tb[SMC_PNETID_ETHNAME]) + goto error; + rc = -ENOENT; + string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); + pnetelem->ndev = dev_get_by_name(net, string); + if (!pnetelem->ndev) + goto error; + + rc = -EINVAL; + if (!tb[SMC_PNETID_IBNAME]) + goto error; + rc = -ENOENT; + ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]); + ibname = strim(ibname); + pnetelem->smcibdev = smc_pnet_find_ib(ibname); + if (!pnetelem->smcibdev) + goto error; + + rc = -EINVAL; + if (!tb[SMC_PNETID_IBPORT]) + goto error; + pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]); + if (pnetelem->ib_port < 1 || pnetelem->ib_port > SMC_MAX_PORTS) + goto error; + return 0; error: @@ -307,6 +312,8 @@ static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info) void *hdr; int rc; + if (!info->attrs[SMC_PNETID_NAME]) + return -EINVAL; pnetelem = smc_pnet_find_pnetid( (char *)nla_data(info->attrs[SMC_PNETID_NAME])); if (!pnetelem) @@ -359,6 +366,8 @@ static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) { + if 
(!info->attrs[SMC_PNETID_NAME]) + return -EINVAL; return smc_pnet_remove_by_pnetid( (char *)nla_data(info->attrs[SMC_PNETID_NAME])); } diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index ed45569289f5..3d77b383cccd 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -47,16 +47,59 @@ static void smc_rx_wake_up(struct sock *sk) * @conn connection to update * @cons consumer cursor * @len number of Bytes consumed + * Returns: + * 1 if we should end our receive, 0 otherwise */ -static void smc_rx_update_consumer(struct smc_connection *conn, - union smc_host_cursor cons, size_t len) +static int smc_rx_update_consumer(struct smc_sock *smc, + union smc_host_cursor cons, size_t len) { - smc_curs_add(conn->rmbe_size, &cons, len); + struct smc_connection *conn = &smc->conn; + struct sock *sk = &smc->sk; + bool force = false; + int diff, rc = 0; + + smc_curs_add(conn->rmb_desc->len, &cons, len); + + /* did we process urgent data? */ + if (conn->urg_state == SMC_URG_VALID || conn->urg_rx_skip_pend) { + diff = smc_curs_comp(conn->rmb_desc->len, &cons, + &conn->urg_curs); + if (sock_flag(sk, SOCK_URGINLINE)) { + if (diff == 0) { + force = true; + rc = 1; + conn->urg_state = SMC_URG_READ; + } + } else { + if (diff == 1) { + /* skip urgent byte */ + force = true; + smc_curs_add(conn->rmb_desc->len, &cons, 1); + conn->urg_rx_skip_pend = false; + } else if (diff < -1) + /* we read past urgent byte */ + conn->urg_state = SMC_URG_READ; + } + } + smc_curs_write(&conn->local_tx_ctrl.cons, smc_curs_read(&cons, conn), conn); + /* send consumer cursor update if required */ /* similar to advertising new TCP rcv_wnd if required */ - smc_tx_consumer_update(conn); + smc_tx_consumer_update(conn, force); + + return rc; +} + +static void smc_rx_update_cons(struct smc_sock *smc, size_t len) +{ + struct smc_connection *conn = &smc->conn; + union smc_host_cursor cons; + + smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn), + conn); + smc_rx_update_consumer(smc, cons, len); } struct smc_spd_priv { @@ -70,7 +113,6 @@ static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe, struct smc_spd_priv *priv = (struct smc_spd_priv *)buf->private; struct smc_sock *smc = priv->smc; struct smc_connection *conn; - union smc_host_cursor cons; struct sock *sk = &smc->sk; if (sk->sk_state == SMC_CLOSED || @@ -79,9 +121,7 @@ static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe, goto out; conn = &smc->conn; lock_sock(sk); - smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn), - conn); - smc_rx_update_consumer(conn, cons, priv->len); + smc_rx_update_cons(smc, priv->len); release_sock(sk); if (atomic_sub_and_test(priv->len, &conn->splice_pending)) smc_rx_wake_up(sk); @@ -184,6 +224,52 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, return rc; } +static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, + int flags) +{ + struct smc_connection *conn = &smc->conn; + union smc_host_cursor cons; + struct sock *sk = &smc->sk; + int rc = 0; + + if (sock_flag(sk, SOCK_URGINLINE) || + !(conn->urg_state == SMC_URG_VALID) || + conn->urg_state == SMC_URG_READ) + return -EINVAL; + + if (conn->urg_state == SMC_URG_VALID) { + if (!(flags & MSG_PEEK)) + smc->conn.urg_state = SMC_URG_READ; + msg->msg_flags |= MSG_OOB; + if (len > 0) { + if (!(flags & MSG_TRUNC)) + rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1); + len = 1; + smc_curs_write(&cons, + smc_curs_read(&conn->local_tx_ctrl.cons, + conn), + conn); + if (smc_curs_diff(conn->rmb_desc->len, &cons, + &conn->urg_curs) > 1) + 
conn->urg_rx_skip_pend = true; + /* Urgent Byte was already accounted for, but trigger + * skipping the urgent byte in non-inline case + */ + if (!(flags & MSG_PEEK)) + smc_rx_update_consumer(smc, cons, 0); + } else { + msg->msg_flags |= MSG_TRUNC; + } + + return rc ? -EFAULT : len; + } + + if (sk->sk_state == SMC_CLOSED || sk->sk_shutdown & RCV_SHUTDOWN) + return 0; + + return -EAGAIN; +} + /* smc_rx_recvmsg - receive data from RMBE * @msg: copy data to receive buffer * @pipe: copy data to pipe if set - indicates splice() call @@ -209,12 +295,12 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, if (unlikely(flags & MSG_ERRQUEUE)) return -EINVAL; /* future work for sk.sk_family == AF_SMC */ - if (flags & MSG_OOB) - return -EINVAL; /* future work */ sk = &smc->sk; if (sk->sk_state == SMC_LISTEN) return -ENOTCONN; + if (flags & MSG_OOB) + return smc_rx_recv_urg(smc, msg, len, flags); timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); @@ -227,6 +313,9 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, if (atomic_read(&conn->bytes_to_rcv)) goto copy; + else if (conn->urg_state == SMC_URG_VALID) + /* we received a single urgent Byte - skip */ + smc_rx_update_cons(smc, 0); if (sk->sk_shutdown & RCV_SHUTDOWN || smc_cdc_rxed_any_close_or_senddone(conn) || @@ -281,18 +370,22 @@ copy: continue; } - /* not more than what user space asked for */ - copylen = min_t(size_t, read_remaining, readable); smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn), conn); /* subsequent splice() calls pick up where previous left */ if (splbytes) - smc_curs_add(conn->rmbe_size, &cons, splbytes); + smc_curs_add(conn->rmb_desc->len, &cons, splbytes); + if (conn->urg_state == SMC_URG_VALID && + sock_flag(&smc->sk, SOCK_URGINLINE) && + readable > 1) + readable--; /* always stop at urgent Byte */ + /* not more than what user space asked for */ + copylen = min_t(size_t, read_remaining, readable); /* determine chunks where to read from rcvbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ - chunk_len = min_t(size_t, - copylen, conn->rmbe_size - cons.count); + chunk_len = min_t(size_t, copylen, conn->rmb_desc->len - + cons.count); chunk_len_sum = chunk_len; chunk_off = cons.count; smc_rmb_sync_sg_for_cpu(conn); @@ -331,10 +424,10 @@ copy: /* increased in recv tasklet smc_cdc_msg_rcv() */ smp_mb__before_atomic(); atomic_sub(copylen, &conn->bytes_to_rcv); - /* guarantee 0 <= bytes_to_rcv <= rmbe_size */ + /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */ smp_mb__after_atomic(); - if (msg) - smc_rx_update_consumer(conn, cons, copylen); + if (msg && smc_rx_update_consumer(smc, cons, copylen)) + goto out; } } while (read_remaining); out: @@ -346,4 +439,5 @@ void smc_rx_init(struct smc_sock *smc) { smc->sk.sk_data_ready = smc_rx_wake_up; atomic_set(&smc->conn.splice_pending, 0); + smc->conn.urg_state = SMC_URG_READ; } diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 58dfe0bd9d60..cee666400752 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -32,7 +32,7 @@ /***************************** sndbuf producer *******************************/ /* callback implementation for sk.sk_write_space() - * to wakeup sndbuf producers that blocked with smc_tx_wait_memory(). + * to wakeup sndbuf producers that blocked with smc_tx_wait(). * called under sk_socket lock. 
*/ static void smc_tx_write_space(struct sock *sk) @@ -56,7 +56,7 @@ static void smc_tx_write_space(struct sock *sk) } } -/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory(). +/* Wakeup sndbuf producers that blocked with smc_tx_wait(). * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space(). */ void smc_tx_sndbuf_nonfull(struct smc_sock *smc) @@ -66,8 +66,10 @@ void smc_tx_sndbuf_nonfull(struct smc_sock *smc) smc->sk.sk_write_space(&smc->sk); } -/* blocks sndbuf producer until at least one byte of free space available */ -static int smc_tx_wait_memory(struct smc_sock *smc, int flags) +/* blocks sndbuf producer until at least one byte of free space available + * or urgent Byte was consumed + */ +static int smc_tx_wait(struct smc_sock *smc, int flags) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; @@ -103,14 +105,15 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags) break; } sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - if (atomic_read(&conn->sndbuf_space)) - break; /* at least 1 byte of free space available */ + if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend) + break; /* at least 1 byte of free & no urgent data */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk_wait_event(sk, &timeo, sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || smc_cdc_rxed_any_close(conn) || - atomic_read(&conn->sndbuf_space), + (atomic_read(&conn->sndbuf_space) && + !conn->urg_tx_pend), &wait); } remove_wait_queue(sk_sleep(sk), &wait); @@ -157,8 +160,11 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) if (smc_cdc_rxed_any_close(conn)) return send_done ?: -ECONNRESET; - if (!atomic_read(&conn->sndbuf_space)) { - rc = smc_tx_wait_memory(smc, msg->msg_flags); + if (msg->msg_flags & MSG_OOB) + conn->local_tx_ctrl.prod_flags.urg_data_pending = 1; + + if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) { + rc = smc_tx_wait(smc, msg->msg_flags); if (rc) { if (send_done) return send_done; @@ -168,7 +174,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) } /* initialize variables for 1st iteration of subsequent loop */ - /* could be just 1 byte, even after smc_tx_wait_memory above */ + /* could be just 1 byte, even after smc_tx_wait above */ writespace = atomic_read(&conn->sndbuf_space); /* not more than what user space asked for */ copylen = min_t(size_t, send_remaining, writespace); @@ -180,8 +186,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) tx_cnt_prep = prep.count; /* determine chunks where to write into sndbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ - chunk_len = min_t(size_t, - copylen, conn->sndbuf_size - tx_cnt_prep); + chunk_len = min_t(size_t, copylen, conn->sndbuf_desc->len - + tx_cnt_prep); chunk_len_sum = chunk_len; chunk_off = tx_cnt_prep; smc_sndbuf_sync_sg_for_cpu(conn); @@ -206,21 +212,23 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) } smc_sndbuf_sync_sg_for_device(conn); /* update cursors */ - smc_curs_add(conn->sndbuf_size, &prep, copylen); + smc_curs_add(conn->sndbuf_desc->len, &prep, copylen); smc_curs_write(&conn->tx_curs_prep, smc_curs_read(&prep, conn), conn); /* increased in send tasklet smc_cdc_tx_handler() */ smp_mb__before_atomic(); atomic_sub(copylen, &conn->sndbuf_space); - /* guarantee 0 <= sndbuf_space <= sndbuf_size */ + /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); /* since we just produced more new data into sndbuf, * trigger 
sndbuf consumer: RDMA write into peer RMBE and CDC */ + if ((msg->msg_flags & MSG_OOB) && !send_remaining) + conn->urg_tx_pend = true; if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && (atomic_read(&conn->sndbuf_space) > - (conn->sndbuf_size >> 1))) + (conn->sndbuf_desc->len >> 1))) /* for a corked socket defer the RDMA writes if there * is still sufficient sndbuf_space available */ @@ -261,7 +269,7 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, rdma_wr.remote_addr = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr + /* RMBE within RMB */ - ((conn->peer_conn_idx - 1) * conn->peer_rmbe_size) + + conn->tx_off + /* offset within RMBE */ peer_rmbe_offset; rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; @@ -286,7 +294,7 @@ static inline void smc_tx_advance_cursors(struct smc_connection *conn, atomic_sub(len, &conn->peer_rmbe_space); /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */ smp_mb__after_atomic(); - smc_curs_add(conn->sndbuf_size, sent, len); + smc_curs_add(conn->sndbuf_desc->len, sent, len); } /* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit; @@ -299,6 +307,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) union smc_host_cursor sent, prep, prod, cons; struct ib_sge sges[SMC_IB_MAX_SEND_SGE]; struct smc_link_group *lgr = conn->lgr; + struct smc_cdc_producer_flags *pflags; int to_send, rmbespace; struct smc_link *link; dma_addr_t dma_addr; @@ -309,7 +318,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); /* cf. wmem_alloc - (snd_max - snd_una) */ - to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep); + to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); if (to_send <= 0) return 0; @@ -326,7 +335,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) conn); /* if usable snd_wnd closes ask peer to advertise once it opens again */ - conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace); + pflags = &conn->local_tx_ctrl.prod_flags; + pflags->write_blocked = (to_send >= rmbespace); /* cf. 
usable snd_wnd */ len = min(to_send, rmbespace); @@ -351,12 +361,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) dst_len_sum = dst_len; src_off = sent.count; /* dst_len determines the maximum src_len */ - if (sent.count + dst_len <= conn->sndbuf_size) { + if (sent.count + dst_len <= conn->sndbuf_desc->len) { /* unwrapped src case: single chunk of entire dst_len */ src_len = dst_len; } else { /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */ - src_len = conn->sndbuf_size - sent.count; + src_len = conn->sndbuf_desc->len - sent.count; } src_len_sum = src_len; dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); @@ -368,8 +378,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) sges[srcchunk].lkey = link->roce_pd->local_dma_lkey; num_sges++; src_off += src_len; - if (src_off >= conn->sndbuf_size) - src_off -= conn->sndbuf_size; + if (src_off >= conn->sndbuf_desc->len) + src_off -= conn->sndbuf_desc->len; /* modulo in send ring */ if (src_len_sum == dst_len) break; /* either on 1st or 2nd iteration */ @@ -387,10 +397,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) dst_len = len - dst_len; /* remainder */ dst_len_sum += dst_len; src_len = min_t(int, - dst_len, conn->sndbuf_size - sent.count); + dst_len, conn->sndbuf_desc->len - sent.count); src_len_sum = src_len; } + if (conn->urg_tx_pend && len == to_send) + pflags->urg_data_present = 1; smc_tx_advance_cursors(conn, &prod, &sent, len); /* update connection's cursors with advanced local cursors */ smc_curs_write(&conn->local_tx_ctrl.prod, @@ -410,6 +422,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) */ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { + struct smc_cdc_producer_flags *pflags; struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; int rc; @@ -433,14 +446,21 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) goto out_unlock; } - rc = smc_tx_rdma_writes(conn); - if (rc) { - smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], - (struct smc_wr_tx_pend_priv *)pend); - goto out_unlock; + if (!conn->local_tx_ctrl.prod_flags.urg_data_present) { + rc = smc_tx_rdma_writes(conn); + if (rc) { + smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], + (struct smc_wr_tx_pend_priv *)pend); + goto out_unlock; + } } rc = smc_cdc_msg_send(conn, wr_buf, pend); + pflags = &conn->local_tx_ctrl.prod_flags; + if (!rc && pflags->urg_data_present) { + pflags->urg_data_pending = 0; + pflags->urg_data_present = 0; + } out_unlock: spin_unlock_bh(&conn->send_lock); @@ -450,7 +470,7 @@ out_unlock: /* Wakeup sndbuf consumers from process context * since there is more data to transmit */ -static void smc_tx_work(struct work_struct *work) +void smc_tx_work(struct work_struct *work) { struct smc_connection *conn = container_of(to_delayed_work(work), struct smc_connection, @@ -473,7 +493,7 @@ out: release_sock(&smc->sk); } -void smc_tx_consumer_update(struct smc_connection *conn) +void smc_tx_consumer_update(struct smc_connection *conn, bool force) { union smc_host_cursor cfed, cons; int to_confirm; @@ -484,11 +504,12 @@ void smc_tx_consumer_update(struct smc_connection *conn) smc_curs_write(&cfed, smc_curs_read(&conn->rx_curs_confirmed, conn), conn); - to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons); + to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons); if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || + force || ((to_confirm > conn->rmbe_update_limit) && - ((to_confirm > (conn->rmbe_size / 2)) || + ((to_confirm > 
(conn->rmb_desc->len / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && conn->alert_token_local) { /* connection healthy */ @@ -512,6 +533,4 @@ void smc_tx_consumer_update(struct smc_connection *conn) void smc_tx_init(struct smc_sock *smc) { smc->sk.sk_write_space = smc_tx_write_space; - INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); - spin_lock_init(&smc->conn.send_lock); } diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h index 78255964fa4d..9d2238909fa0 100644 --- a/net/smc/smc_tx.h +++ b/net/smc/smc_tx.h @@ -24,13 +24,14 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn) smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); - return smc_curs_diff(conn->sndbuf_size, &sent, &prep); + return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); } +void smc_tx_work(struct work_struct *work); void smc_tx_init(struct smc_sock *smc); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); int smc_tx_sndbuf_nonempty(struct smc_connection *conn); void smc_tx_sndbuf_nonfull(struct smc_sock *smc); -void smc_tx_consumer_update(struct smc_connection *conn); +void smc_tx_consumer_update(struct smc_connection *conn, bool force); #endif /* SMC_TX_H */ diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 5c3909c311f1..839e1e165a0c 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -680,7 +680,6 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgin = &sgin_arr[0]; struct strp_msg *rxm = strp_msg(skb); int ret, nsg = ARRAY_SIZE(sgin_arr); - char aad_recv[TLS_AAD_SPACE_SIZE]; struct sk_buff *unused; ret = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, @@ -697,13 +696,13 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb, } sg_init_table(sgin, nsg); - sg_set_buf(&sgin[0], aad_recv, sizeof(aad_recv)); + sg_set_buf(&sgin[0], ctx->rx_aad_ciphertext, TLS_AAD_SPACE_SIZE); nsg = skb_to_sgvec(skb, &sgin[1], rxm->offset + tls_ctx->rx.prepend_size, rxm->full_len - tls_ctx->rx.prepend_size); - tls_make_aad(aad_recv, + tls_make_aad(ctx->rx_aad_ciphertext, rxm->full_len - tls_ctx->rx.overhead_size, tls_ctx->rx.rec_seq, tls_ctx->rx.rec_seq_size, @@ -802,12 +801,12 @@ int tls_sw_recvmsg(struct sock *sk, if (to_copy <= len && page_count < MAX_SKB_FRAGS && likely(!(flags & MSG_PEEK))) { struct scatterlist sgin[MAX_SKB_FRAGS + 1]; - char unused[21]; int pages = 0; zc = true; sg_init_table(sgin, MAX_SKB_FRAGS + 1); - sg_set_buf(&sgin[0], unused, 13); + sg_set_buf(&sgin[0], ctx->rx_aad_plaintext, + TLS_AAD_SPACE_SIZE); err = zerocopy_from_iter(sk, &msg->msg_iter, to_copy, &pages, diff --git a/net/wireless/core.c b/net/wireless/core.c index c0fd8a85e7f7..5fe35aafdd9c 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -725,6 +725,10 @@ int wiphy_register(struct wiphy *wiphy) (!rdev->ops->set_pmk || !rdev->ops->del_pmk))) return -EINVAL; + if (WARN_ON(!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_FW_ROAM) && + rdev->ops->update_connect_params)) + return -EINVAL; + if (wiphy->addresses) memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a052693c2e85..bc40a783cb27 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4,6 +4,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH + 
* Copyright (C) 2018 Intel Corporation */ #include <linux/if.h> @@ -423,6 +424,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN }, [NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG }, [NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG }, + + [NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 }, + [NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 }, + [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 }, }; /* policy for the key attributes */ @@ -645,7 +650,43 @@ static inline void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq, return genlmsg_put(skb, portid, seq, &nl80211_fam, flags, cmd); } -static int nl80211_msg_put_channel(struct sk_buff *msg, +static int nl80211_msg_put_wmm_rules(struct sk_buff *msg, + const struct ieee80211_reg_rule *rule) +{ + int j; + struct nlattr *nl_wmm_rules = + nla_nest_start(msg, NL80211_FREQUENCY_ATTR_WMM); + + if (!nl_wmm_rules) + goto nla_put_failure; + + for (j = 0; j < IEEE80211_NUM_ACS; j++) { + struct nlattr *nl_wmm_rule = nla_nest_start(msg, j); + + if (!nl_wmm_rule) + goto nla_put_failure; + + if (nla_put_u16(msg, NL80211_WMMR_CW_MIN, + rule->wmm_rule->client[j].cw_min) || + nla_put_u16(msg, NL80211_WMMR_CW_MAX, + rule->wmm_rule->client[j].cw_max) || + nla_put_u8(msg, NL80211_WMMR_AIFSN, + rule->wmm_rule->client[j].aifsn) || + nla_put_u8(msg, NL80211_WMMR_TXOP, + rule->wmm_rule->client[j].cot)) + goto nla_put_failure; + + nla_nest_end(msg, nl_wmm_rule); + } + nla_nest_end(msg, nl_wmm_rules); + + return 0; + +nla_put_failure: + return -ENOBUFS; +} + +static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, struct ieee80211_channel *chan, bool large) { @@ -721,12 +762,55 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, DBM_TO_MBM(chan->max_power))) goto nla_put_failure; + if (large) { + const struct ieee80211_reg_rule *rule = + freq_reg_info(wiphy, chan->center_freq); + + if (!IS_ERR(rule) && rule->wmm_rule) { + if (nl80211_msg_put_wmm_rules(msg, rule)) + goto nla_put_failure; + } + } + return 0; nla_put_failure: return -ENOBUFS; } +static bool nl80211_put_txq_stats(struct sk_buff *msg, + struct cfg80211_txq_stats *txqstats, + int attrtype) +{ + struct nlattr *txqattr; + +#define PUT_TXQVAL_U32(attr, memb) do { \ + if (txqstats->filled & BIT(NL80211_TXQ_STATS_ ## attr) && \ + nla_put_u32(msg, NL80211_TXQ_STATS_ ## attr, txqstats->memb)) \ + return false; \ + } while (0) + + txqattr = nla_nest_start(msg, attrtype); + if (!txqattr) + return false; + + PUT_TXQVAL_U32(BACKLOG_BYTES, backlog_bytes); + PUT_TXQVAL_U32(BACKLOG_PACKETS, backlog_packets); + PUT_TXQVAL_U32(FLOWS, flows); + PUT_TXQVAL_U32(DROPS, drops); + PUT_TXQVAL_U32(ECN_MARKS, ecn_marks); + PUT_TXQVAL_U32(OVERLIMIT, overlimit); + PUT_TXQVAL_U32(OVERMEMORY, overmemory); + PUT_TXQVAL_U32(COLLISIONS, collisions); + PUT_TXQVAL_U32(TX_BYTES, tx_bytes); + PUT_TXQVAL_U32(TX_PACKETS, tx_packets); + PUT_TXQVAL_U32(MAX_FLOWS, max_flows); + nla_nest_end(msg, txqattr); + +#undef PUT_TXQVAL_U32 + return true; +} + /* netlink command implementations */ struct key_parse { @@ -1631,7 +1715,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, chan = &sband->channels[i]; if (nl80211_msg_put_channel( - msg, chan, + msg, &rdev->wiphy, chan, state->split)) goto nla_put_failure; @@ -1926,6 +2010,28 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, rdev->wiphy.nan_supported_bands)) goto nla_put_failure; + if 
(wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_TXQS)) { + struct cfg80211_txq_stats txqstats = {}; + int res; + + res = rdev_get_txq_stats(rdev, NULL, &txqstats); + if (!res && + !nl80211_put_txq_stats(msg, &txqstats, + NL80211_ATTR_TXQ_STATS)) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_TXQ_LIMIT, + rdev->wiphy.txq_limit)) + goto nla_put_failure; + if (nla_put_u32(msg, NL80211_ATTR_TXQ_MEMORY_LIMIT, + rdev->wiphy.txq_memory_limit)) + goto nla_put_failure; + if (nla_put_u32(msg, NL80211_ATTR_TXQ_QUANTUM, + rdev->wiphy.txq_quantum)) + goto nla_put_failure; + } + /* done */ state->split_start = 0; break; @@ -2303,6 +2409,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) u8 retry_short = 0, retry_long = 0; u32 frag_threshold = 0, rts_threshold = 0; u8 coverage_class = 0; + u32 txq_limit = 0, txq_memory_limit = 0, txq_quantum = 0; ASSERT_RTNL(); @@ -2509,10 +2616,38 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) changed |= WIPHY_PARAM_DYN_ACK; } + if (info->attrs[NL80211_ATTR_TXQ_LIMIT]) { + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_TXQS)) + return -EOPNOTSUPP; + txq_limit = nla_get_u32( + info->attrs[NL80211_ATTR_TXQ_LIMIT]); + changed |= WIPHY_PARAM_TXQ_LIMIT; + } + + if (info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]) { + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_TXQS)) + return -EOPNOTSUPP; + txq_memory_limit = nla_get_u32( + info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]); + changed |= WIPHY_PARAM_TXQ_MEMORY_LIMIT; + } + + if (info->attrs[NL80211_ATTR_TXQ_QUANTUM]) { + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_TXQS)) + return -EOPNOTSUPP; + txq_quantum = nla_get_u32( + info->attrs[NL80211_ATTR_TXQ_QUANTUM]); + changed |= WIPHY_PARAM_TXQ_QUANTUM; + } + if (changed) { u8 old_retry_short, old_retry_long; u32 old_frag_threshold, old_rts_threshold; u8 old_coverage_class; + u32 old_txq_limit, old_txq_memory_limit, old_txq_quantum; if (!rdev->ops->set_wiphy_params) return -EOPNOTSUPP; @@ -2522,6 +2657,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) old_frag_threshold = rdev->wiphy.frag_threshold; old_rts_threshold = rdev->wiphy.rts_threshold; old_coverage_class = rdev->wiphy.coverage_class; + old_txq_limit = rdev->wiphy.txq_limit; + old_txq_memory_limit = rdev->wiphy.txq_memory_limit; + old_txq_quantum = rdev->wiphy.txq_quantum; if (changed & WIPHY_PARAM_RETRY_SHORT) rdev->wiphy.retry_short = retry_short; @@ -2533,6 +2671,12 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.rts_threshold = rts_threshold; if (changed & WIPHY_PARAM_COVERAGE_CLASS) rdev->wiphy.coverage_class = coverage_class; + if (changed & WIPHY_PARAM_TXQ_LIMIT) + rdev->wiphy.txq_limit = txq_limit; + if (changed & WIPHY_PARAM_TXQ_MEMORY_LIMIT) + rdev->wiphy.txq_memory_limit = txq_memory_limit; + if (changed & WIPHY_PARAM_TXQ_QUANTUM) + rdev->wiphy.txq_quantum = txq_quantum; result = rdev_set_wiphy_params(rdev, changed); if (result) { @@ -2541,6 +2685,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.frag_threshold = old_frag_threshold; rdev->wiphy.rts_threshold = old_rts_threshold; rdev->wiphy.coverage_class = old_coverage_class; + rdev->wiphy.txq_limit = old_txq_limit; + rdev->wiphy.txq_memory_limit = old_txq_memory_limit; + rdev->wiphy.txq_quantum = old_txq_quantum; return result; } } @@ -2662,6 +2809,16 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, 
u32 seq, int flag } wdev_unlock(wdev); + if (rdev->ops->get_txq_stats) { + struct cfg80211_txq_stats txqstats = {}; + int ret = rdev_get_txq_stats(rdev, wdev, &txqstats); + + if (ret == 0 && + !nl80211_put_txq_stats(msg, &txqstats, + NL80211_ATTR_TXQ_STATS)) + goto nla_put_failure; + } + genlmsg_end(msg, hdr); return 0; @@ -4494,11 +4651,14 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO_U64(BEACON_RX, rx_beacon); PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8); PUT_SINFO(ACK_SIGNAL, ack_signal, u8); + if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT)) + PUT_SINFO(DATA_ACK_SIGNAL_AVG, avg_ack_signal, s8); #undef PUT_SINFO #undef PUT_SINFO_U64 - if (sinfo->filled & BIT(NL80211_STA_INFO_TID_STATS)) { + if (sinfo->pertid) { struct nlattr *tidsattr; int tid; @@ -4532,6 +4692,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_TIDVAL_U64(TX_MSDU_FAILED, tx_msdu_failed); #undef PUT_TIDVAL_U64 + if ((tidstats->filled & + BIT(NL80211_TID_STATS_TXQ_STATS)) && + !nl80211_put_txq_stats(msg, &tidstats->txq_stats, + NL80211_TID_STATS_TXQ_STATS)) + goto nla_put_failure; + nla_nest_end(msg, tidattr); } @@ -4545,10 +4711,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, sinfo->assoc_req_ies)) goto nla_put_failure; + cfg80211_sinfo_release_content(sinfo); genlmsg_end(msg, hdr); return 0; nla_put_failure: + cfg80211_sinfo_release_content(sinfo); genlmsg_cancel(msg, hdr); return -EMSGSIZE; } @@ -4630,8 +4798,10 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) return err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) + if (!msg) { + cfg80211_sinfo_release_content(&sinfo); return -ENOMEM; + } if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, info->snd_portid, info->snd_seq, 0, @@ -7930,7 +8100,15 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb) wdev_lock(wdev); spin_lock_bh(&rdev->bss_lock); - cfg80211_bss_expire(rdev); + + /* + * dump_scan will be called multiple times to break up the scan results + * into multiple messages. It is unlikely that any more bss-es will be + * expired after the first call, so only call this on the + * first dump_scan invocation. 
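+ * (A netlink dump callback runs repeatedly with its position kept in + * cb->args, so start == 0 identifies the first pass.)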
+ */ + if (start == 0) + cfg80211_bss_expire(rdev); cb->seq = rdev->bss_generation; @@ -8336,6 +8514,10 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) const u8 *bssid, *ssid; int err, ssid_len = 0; + if (dev->ieee80211_ptr->conn_owner_nlportid && + dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) + return -EPERM; + if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) return -EINVAL; @@ -8458,6 +8640,10 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info) u16 reason_code; bool local_state_change; + if (dev->ieee80211_ptr->conn_owner_nlportid && + dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) + return -EPERM; + if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) return -EINVAL; @@ -8505,6 +8691,10 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info) u16 reason_code; bool local_state_change; + if (dev->ieee80211_ptr->conn_owner_nlportid && + dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) + return -EPERM; + if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) return -EINVAL; @@ -9251,6 +9441,8 @@ static int nl80211_update_connect_params(struct sk_buff *skb, struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; + bool fils_sk_offload; + u32 auth_type; u32 changed = 0; int ret; @@ -9265,6 +9457,56 @@ static int nl80211_update_connect_params(struct sk_buff *skb, changed |= UPDATE_ASSOC_IES; } + fils_sk_offload = wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_FILS_SK_OFFLOAD); + + /* + * when driver supports fils-sk offload all attributes must be + * provided. So the else covers "fils-sk-not-all" and + * "no-fils-sk-any". 
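+ * I.e. the four ERP attributes (username, realm, next_seq_num, rrk) + * are accepted only as a complete set, and only with fils-sk offload; + * any other combination is rejected with -EINVAL below.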
+ */ + if (fils_sk_offload && + info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] && + info->attrs[NL80211_ATTR_FILS_ERP_REALM] && + info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] && + info->attrs[NL80211_ATTR_FILS_ERP_RRK]) { + connect.fils_erp_username = + nla_data(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]); + connect.fils_erp_username_len = + nla_len(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]); + connect.fils_erp_realm = + nla_data(info->attrs[NL80211_ATTR_FILS_ERP_REALM]); + connect.fils_erp_realm_len = + nla_len(info->attrs[NL80211_ATTR_FILS_ERP_REALM]); + connect.fils_erp_next_seq_num = + nla_get_u16( + info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM]); + connect.fils_erp_rrk = + nla_data(info->attrs[NL80211_ATTR_FILS_ERP_RRK]); + connect.fils_erp_rrk_len = + nla_len(info->attrs[NL80211_ATTR_FILS_ERP_RRK]); + changed |= UPDATE_FILS_ERP_INFO; + } else if (info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] || + info->attrs[NL80211_ATTR_FILS_ERP_REALM] || + info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] || + info->attrs[NL80211_ATTR_FILS_ERP_RRK]) { + return -EINVAL; + } + + if (info->attrs[NL80211_ATTR_AUTH_TYPE]) { + auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]); + if (!nl80211_valid_auth_type(rdev, auth_type, + NL80211_CMD_CONNECT)) + return -EINVAL; + + if (auth_type == NL80211_AUTHTYPE_FILS_SK && + fils_sk_offload && !(changed & UPDATE_FILS_ERP_INFO)) + return -EINVAL; + + connect.auth_type = auth_type; + changed |= UPDATE_AUTH_TYPE; + } + wdev_lock(dev->ieee80211_ptr); if (!wdev->current_bss) ret = -ENOLINK; @@ -9282,6 +9524,10 @@ static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info) u16 reason; int ret; + if (dev->ieee80211_ptr->conn_owner_nlportid && + dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) + return -EPERM; + if (!info->attrs[NL80211_ATTR_REASON_CODE]) reason = WLAN_REASON_DEAUTH_LEAVING; else @@ -14028,8 +14274,8 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, void *hdr; msg = nlmsg_new(100 + cr->req_ie_len + cr->resp_ie_len + - cr->fils_kek_len + cr->pmk_len + - (cr->pmkid ? WLAN_PMKID_LEN : 0), gfp); + cr->fils.kek_len + cr->fils.pmk_len + + (cr->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp); if (!msg) return; @@ -14055,17 +14301,17 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, (cr->resp_ie && nla_put(msg, NL80211_ATTR_RESP_IE, cr->resp_ie_len, cr->resp_ie)) || - (cr->update_erp_next_seq_num && + (cr->fils.update_erp_next_seq_num && nla_put_u16(msg, NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM, - cr->fils_erp_next_seq_num)) || + cr->fils.erp_next_seq_num)) || (cr->status == WLAN_STATUS_SUCCESS && - ((cr->fils_kek && - nla_put(msg, NL80211_ATTR_FILS_KEK, cr->fils_kek_len, - cr->fils_kek)) || - (cr->pmk && - nla_put(msg, NL80211_ATTR_PMK, cr->pmk_len, cr->pmk)) || - (cr->pmkid && - nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, cr->pmkid))))) + ((cr->fils.kek && + nla_put(msg, NL80211_ATTR_FILS_KEK, cr->fils.kek_len, + cr->fils.kek)) || + (cr->fils.pmk && + nla_put(msg, NL80211_ATTR_PMK, cr->fils.pmk_len, cr->fils.pmk)) || + (cr->fils.pmkid && + nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, cr->fils.pmkid))))) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -14086,7 +14332,9 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, void *hdr; const u8 *bssid = info->bss ? 
info->bss->bssid : info->bssid; - msg = nlmsg_new(100 + info->req_ie_len + info->resp_ie_len, gfp); + msg = nlmsg_new(100 + info->req_ie_len + info->resp_ie_len + + info->fils.kek_len + info->fils.pmk_len + + (info->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp); if (!msg) return; @@ -14104,7 +14352,17 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, info->req_ie)) || (info->resp_ie && nla_put(msg, NL80211_ATTR_RESP_IE, info->resp_ie_len, - info->resp_ie))) + info->resp_ie)) || + (info->fils.update_erp_next_seq_num && + nla_put_u16(msg, NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM, + info->fils.erp_next_seq_num)) || + (info->fils.kek && + nla_put(msg, NL80211_ATTR_FILS_KEK, info->fils.kek_len, + info->fils.kek)) || + (info->fils.pmk && + nla_put(msg, NL80211_ATTR_PMK, info->fils.pmk_len, info->fils.pmk)) || + (info->fils.pmkid && + nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, info->fils.pmkid))) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -14321,7 +14579,8 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy, nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_BEFORE); if (!nl_freq) goto nla_put_failure; - if (nl80211_msg_put_channel(msg, channel_before, false)) + + if (nl80211_msg_put_channel(msg, wiphy, channel_before, false)) goto nla_put_failure; nla_nest_end(msg, nl_freq); @@ -14329,7 +14588,8 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy, nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_AFTER); if (!nl_freq) goto nla_put_failure; - if (nl80211_msg_put_channel(msg, channel_after, false)) + + if (nl80211_msg_put_channel(msg, wiphy, channel_after, false)) goto nla_put_failure; nla_nest_end(msg, nl_freq); @@ -14456,8 +14716,10 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr, trace_cfg80211_del_sta(dev, mac_addr); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); - if (!msg) + if (!msg) { + cfg80211_sinfo_release_content(sinfo); return; + } if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0, rdev, dev, mac_addr, sinfo) < 0) { diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 87479a53411b..364f5d67f05b 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -586,6 +586,18 @@ rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev, return ret; } +static inline int +rdev_get_txq_stats(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct cfg80211_txq_stats *txqstats) +{ + int ret; + trace_rdev_get_txq_stats(&rdev->wiphy, wdev); + ret = rdev->ops->get_txq_stats(&rdev->wiphy, wdev, txqstats); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev) { trace_rdev_rfkill_poll(&rdev->wiphy); diff --git a/net/wireless/reg.c b/net/wireless/reg.c index ac3e12c32aa3..e55099b1785d 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1653,7 +1653,7 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator) case NL80211_REGDOM_SET_BY_DRIVER: return "driver"; case NL80211_REGDOM_SET_BY_COUNTRY_IE: - return "country IE"; + return "country element"; default: WARN_ON(1); return "bug"; @@ -2619,7 +2619,7 @@ reg_process_hint_country_ie(struct wiphy *wiphy, * This doesn't happen yet, not sure we * ever want to support it for this case. 
*/ - WARN_ONCE(1, "Unexpected intersection for country IEs"); + WARN_ONCE(1, "Unexpected intersection for country elements"); return REG_REQ_IGNORE; } @@ -2769,6 +2769,21 @@ out_free: reg_free_request(reg_request); } +static void notify_self_managed_wiphys(struct regulatory_request *request) +{ + struct cfg80211_registered_device *rdev; + struct wiphy *wiphy; + + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { + wiphy = &rdev->wiphy; + if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED && + request->initiator == NL80211_REGDOM_SET_BY_USER && + request->user_reg_hint_type == + NL80211_USER_REG_HINT_CELL_BASE) + reg_call_notifier(wiphy, request); + } +} + static bool reg_only_self_managed_wiphys(void) { struct cfg80211_registered_device *rdev; @@ -2820,6 +2835,7 @@ static void reg_process_pending_hints(void) spin_unlock(®_requests_lock); + notify_self_managed_wiphys(reg_request); if (reg_only_self_managed_wiphys()) { reg_free_request(reg_request); return; @@ -3384,7 +3400,7 @@ bool reg_supported_dfs_region(enum nl80211_dfs_regions dfs_region) case NL80211_DFS_JP: return true; default: - pr_debug("Ignoring uknown DFS master region: %d\n", dfs_region); + pr_debug("Ignoring unknown DFS master region: %d\n", dfs_region); return false; } } @@ -3699,17 +3715,26 @@ EXPORT_SYMBOL(regulatory_set_wiphy_regd_sync_rtnl); void wiphy_regulatory_register(struct wiphy *wiphy) { - struct regulatory_request *lr; + struct regulatory_request *lr = get_last_request(); - /* self-managed devices ignore external hints */ - if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) + /* self-managed devices ignore beacon hints and country IE */ + if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { wiphy->regulatory_flags |= REGULATORY_DISABLE_BEACON_HINTS | REGULATORY_COUNTRY_IE_IGNORE; + /* + * The last request may have been received before this + * registration call. Call the driver notifier if + * initiator is USER and user type is CELL_BASE. + */ + if (lr->initiator == NL80211_REGDOM_SET_BY_USER && + lr->user_reg_hint_type == NL80211_USER_REG_HINT_CELL_BASE) + reg_call_notifier(wiphy, lr); + } + if (!reg_dev_ignore_cell_hint(wiphy)) reg_num_devs_support_basehint++; - lr = get_last_request(); wiphy_update_regulatory(wiphy, lr->initiator); wiphy_all_share_dfs_chan_state(wiphy); } diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 5df6b33db786..d536b07582f8 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -803,8 +803,8 @@ void cfg80211_connect_done(struct net_device *dev, ev = kzalloc(sizeof(*ev) + (params->bssid ? ETH_ALEN : 0) + params->req_ie_len + params->resp_ie_len + - params->fils_kek_len + params->pmk_len + - (params->pmkid ? WLAN_PMKID_LEN : 0), gfp); + params->fils.kek_len + params->fils.pmk_len + + (params->fils.pmkid ? 
WLAN_PMKID_LEN : 0), gfp); if (!ev) { cfg80211_put_bss(wdev->wiphy, params->bss); return; @@ -831,27 +831,29 @@ void cfg80211_connect_done(struct net_device *dev, params->resp_ie_len); next += params->resp_ie_len; } - if (params->fils_kek_len) { - ev->cr.fils_kek = next; - ev->cr.fils_kek_len = params->fils_kek_len; - memcpy((void *)ev->cr.fils_kek, params->fils_kek, - params->fils_kek_len); - next += params->fils_kek_len; + if (params->fils.kek_len) { + ev->cr.fils.kek = next; + ev->cr.fils.kek_len = params->fils.kek_len; + memcpy((void *)ev->cr.fils.kek, params->fils.kek, + params->fils.kek_len); + next += params->fils.kek_len; } - if (params->pmk_len) { - ev->cr.pmk = next; - ev->cr.pmk_len = params->pmk_len; - memcpy((void *)ev->cr.pmk, params->pmk, params->pmk_len); - next += params->pmk_len; + if (params->fils.pmk_len) { + ev->cr.fils.pmk = next; + ev->cr.fils.pmk_len = params->fils.pmk_len; + memcpy((void *)ev->cr.fils.pmk, params->fils.pmk, + params->fils.pmk_len); + next += params->fils.pmk_len; } - if (params->pmkid) { - ev->cr.pmkid = next; - memcpy((void *)ev->cr.pmkid, params->pmkid, WLAN_PMKID_LEN); + if (params->fils.pmkid) { + ev->cr.fils.pmkid = next; + memcpy((void *)ev->cr.fils.pmkid, params->fils.pmkid, + WLAN_PMKID_LEN); next += WLAN_PMKID_LEN; } - ev->cr.update_erp_next_seq_num = params->update_erp_next_seq_num; - if (params->update_erp_next_seq_num) - ev->cr.fils_erp_next_seq_num = params->fils_erp_next_seq_num; + ev->cr.fils.update_erp_next_seq_num = params->fils.update_erp_next_seq_num; + if (params->fils.update_erp_next_seq_num) + ev->cr.fils.erp_next_seq_num = params->fils.erp_next_seq_num; if (params->bss) cfg80211_hold_bss(bss_from_pub(params->bss)); ev->cr.bss = params->bss; @@ -930,6 +932,7 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; + u8 *next; if (!info->bss) { info->bss = cfg80211_get_bss(wdev->wiphy, info->channel, @@ -942,19 +945,52 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, if (WARN_ON(!info->bss)) return; - ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len, gfp); + ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len + + info->fils.kek_len + info->fils.pmk_len + + (info->fils.pmkid ? 
WLAN_PMKID_LEN : 0), gfp); if (!ev) { cfg80211_put_bss(wdev->wiphy, info->bss); return; } ev->type = EVENT_ROAMED; - ev->rm.req_ie = ((u8 *)ev) + sizeof(*ev); - ev->rm.req_ie_len = info->req_ie_len; - memcpy((void *)ev->rm.req_ie, info->req_ie, info->req_ie_len); - ev->rm.resp_ie = ((u8 *)ev) + sizeof(*ev) + info->req_ie_len; - ev->rm.resp_ie_len = info->resp_ie_len; - memcpy((void *)ev->rm.resp_ie, info->resp_ie, info->resp_ie_len); + next = ((u8 *)ev) + sizeof(*ev); + if (info->req_ie_len) { + ev->rm.req_ie = next; + ev->rm.req_ie_len = info->req_ie_len; + memcpy((void *)ev->rm.req_ie, info->req_ie, info->req_ie_len); + next += info->req_ie_len; + } + if (info->resp_ie_len) { + ev->rm.resp_ie = next; + ev->rm.resp_ie_len = info->resp_ie_len; + memcpy((void *)ev->rm.resp_ie, info->resp_ie, + info->resp_ie_len); + next += info->resp_ie_len; + } + if (info->fils.kek_len) { + ev->rm.fils.kek = next; + ev->rm.fils.kek_len = info->fils.kek_len; + memcpy((void *)ev->rm.fils.kek, info->fils.kek, + info->fils.kek_len); + next += info->fils.kek_len; + } + if (info->fils.pmk_len) { + ev->rm.fils.pmk = next; + ev->rm.fils.pmk_len = info->fils.pmk_len; + memcpy((void *)ev->rm.fils.pmk, info->fils.pmk, + info->fils.pmk_len); + next += info->fils.pmk_len; + } + if (info->fils.pmkid) { + ev->rm.fils.pmkid = next; + memcpy((void *)ev->rm.fils.pmkid, info->fils.pmkid, + WLAN_PMKID_LEN); + next += WLAN_PMKID_LEN; + } + ev->rm.fils.update_erp_next_seq_num = info->fils.update_erp_next_seq_num; + if (info->fils.update_erp_next_seq_num) + ev->rm.fils.erp_next_seq_num = info->fils.erp_next_seq_num; ev->rm.bss = info->bss; spin_lock_irqsave(&wdev->event_lock, flags); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 55fb279a5196..2b417a2fe63f 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3243,6 +3243,20 @@ TRACE_EVENT(rdev_set_multicast_to_unicast, WIPHY_PR_ARG, NETDEV_PR_ARG, BOOL_TO_STR(__entry->enabled)) ); + +TRACE_EVENT(rdev_get_txq_stats, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), + TP_ARGS(wiphy, wdev), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG) +); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/wireless/util.c b/net/wireless/util.c index d112e9a89364..b5bb1c309914 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1787,6 +1787,17 @@ bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, return false; } +int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp) +{ + sinfo->pertid = kcalloc(sizeof(*(sinfo->pertid)), + IEEE80211_NUM_TIDS + 1, gfp); + if (!sinfo->pertid) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(cfg80211_sinfo_alloc_tid_stats); + /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ /* Ethernet-II snap header (RFC1042 for most EtherTypes) */ const unsigned char rfc1042_header[] __aligned(2) = |
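
Editor's note: as a minimal sketch (not part of the patches above), a driver could back the new get_txq_stats operation roughly as follows. The callback signature and the NL80211_TXQ_STATS_* / ->filled convention come from the rdev_get_txq_stats() and nl80211_put_txq_stats() hunks above; struct drv_priv, its fields and the use of wiphy_priv() are assumptions for illustration.

static int drv_get_txq_stats(struct wiphy *wiphy,
			     struct wireless_dev *wdev,
			     struct cfg80211_txq_stats *txqstats)
{
	/* hypothetical driver private data reached via wiphy_priv() */
	struct drv_priv *priv = wiphy_priv(wiphy);

	/* this sketch reports phy-wide stats only; per-interface stats
	 * are requested with a non-NULL wdev (cf. nl80211_send_iface())
	 */
	if (wdev)
		return -EOPNOTSUPP;

	txqstats->backlog_bytes = priv->qlen_bytes;
	txqstats->backlog_packets = priv->qlen_packets;
	/* nl80211_put_txq_stats() emits only fields flagged in ->filled */
	txqstats->filled = BIT(NL80211_TXQ_STATS_BACKLOG_BYTES) |
			   BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS);
	return 0;
}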
