aboutsummaryrefslogtreecommitdiffstats
path: root/net/core/devmem.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-03-26 21:48:21 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-03-26 21:48:21 -0700
commit1a9239bb4253f9076b5b4b2a1a4e8d7defd77a95 (patch)
tree286dda5e84757594218e684b94b01b3a3cac15a2 /net/core/devmem.c
parentMerge tag 'zstd-linus-v6.15-rc1' of https://github.com/terrelln/linux (diff)
parentMerge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net (diff)
downloadlinux-1a9239bb4253f9076b5b4b2a1a4e8d7defd77a95.tar.gz
linux-1a9239bb4253f9076b5b4b2a1a4e8d7defd77a95.zip
Merge tag 'net-next-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core & protocols: - Continue Netlink conversions to per-namespace RTNL lock (IPv4 routing, routing rules, routing next hops, ARP ioctls) - Continue extending the use of netdev instance locks. As a driver opt-in protect queue operations and (in due course) ethtool operations with the instance lock and not RTNL lock. - Support collecting TCP timestamps (data submitted, sent, acked) in BPF, allowing for transparent (to the application) and lower overhead tracking of TCP RPC performance. - Tweak existing networking Rx zero-copy infra to support zero-copy Rx via io_uring. - Optimize MPTCP performance in single subflow mode by 29%. - Enable GRO on packets which went thru XDP CPU redirect (were queued for processing on a different CPU). Improving TCP stream performance up to 2x. - Improve performance of contended connect() by 200% by searching for an available 4-tuple under RCU rather than a spin lock. Bring an additional 229% improvement by tweaking hash distribution. - Avoid unconditionally touching sk_tsflags on RX, improving performance under UDP flood by as much as 10%. - Avoid skb_clone() dance in ping_rcv() to improve performance under ping flood. - Avoid FIB lookup in netfilter if socket is available, 20% perf win. - Rework network device creation (in-kernel) API to more clearly identify network namespaces and their roles. There are up to 4 namespace roles but we used to have just 2 netns pointer arguments, interpreted differently based on context. - Use sysfs_break_active_protection() instead of trylock to avoid deadlocks between unregistering objects and sysfs access. - Add a new sysctl and sockopt for capping max retransmit timeout in TCP. - Support masking port and DSCP in routing rule matches. - Support dumping IPv4 multicast addresses with RTM_GETMULTICAST. - Support specifying at what time packet should be sent on AF_XDP sockets. - Expose TCP ULP diagnostic info (for TLS and MPTCP) to non-admin users. - Add Netlink YAML spec for WiFi (nl80211) and conntrack. - Introduce EXPORT_IPV6_MOD() and EXPORT_IPV6_MOD_GPL() for symbols which only need to be exported when IPv6 support is built as a module. - Age FDB entries based on Rx not Tx traffic in VxLAN, similar to normal bridging. - Allow users to specify source port range for GENEVE tunnels. - netconsole: allow attaching kernel release, CPU ID and task name to messages as metadata Driver API: - Continue rework / fixing of Energy Efficient Ethernet (EEE) across the SW layers. Delegate the responsibilities to phylink where possible. Improve its handling in phylib. - Support symmetric OR-XOR RSS hashing algorithm. - Support tracking and preserving IRQ affinity by NAPI itself. - Support loopback mode speed selection for interface selftests. Device drivers: - Remove the IBM LCS driver for s390 - Remove the sb1000 cable modem driver - Add support for SFP module access over SMBus - Add MCTP transport driver for MCTP-over-USB - Enable XDP metadata support in multiple drivers - Ethernet high-speed NICs: - Broadcom (bnxt): - add PCIe TLP Processing Hints (TPH) support for new AMD platforms - support dumping RoCE queue state for debug - opt into instance locking - Intel (100G, ice, idpf): - ice: rework MSI-X IRQ management and distribution - ice: support for E830 devices - iavf: add support for Rx timestamping - iavf: opt into instance locking - nVidia/Mellanox: - mlx4: use page pool memory allocator for Rx - mlx5: support for one PTP device per hardware clock - mlx5: support for 200Gbps per-lane link modes - mlx5: move IPSec policy check after decryption - AMD/Solarflare: - support FW flashing via devlink - Cisco (enic): - use page pool memory allocator for Rx - enable 32, 64 byte CQEs - get max rx/tx ring size from the device - Meta (fbnic): - support flow steering and RSS configuration - report queue stats - support TCP segmentation - support IRQ coalescing - support ring size configuration - Marvell/Cavium: - support AF_XDP - Wangxun: - support for PTP clock and timestamping - Huawei (hibmcge): - checksum offload - add more statistics - Ethernet virtual: - VirtIO net: - aggressively suppress Tx completions, improve perf by 96% with 1 CPU and 55% with 2 CPUs - expose NAPI to IRQ mapping and persist NAPI settings - Google (gve): - support XDP in DQO RDA Queue Format - opt into instance locking - Microsoft vNIC: - support BIG TCP - Ethernet NICs consumer, and embedded: - Synopsys (stmmac): - cleanup Tx and Tx clock setting and other link-focused cleanups - enable SGMII and 2500BASEX mode switching for Intel platforms - support Sophgo SG2044 - Broadcom switches (b53): - support for BCM53101 - TI: - iep: add perout configuration support - icssg: support XDP - Cadence (macb): - implement BQL - Xilinx (axinet): - support dynamic IRQ moderation and changing coalescing at runtime - implement BQL - report standard stats - MediaTek: - support phylink managed EEE - Intel: - igc: don't restart the interface on every XDP program change - RealTek (r8169): - support reading registers of internal PHYs directly - increase max jumbo packet size on RTL8125/RTL8126 - Airoha: - support for RISC-V NPU packet processing unit - enable scatter-gather and support MTU up to 9kB - Tehuti (tn40xx): - support cards with TN4010 MAC and an Aquantia AQR105 PHY - Ethernet PHYs: - support for TJA1102S, TJA1121 - dp83tg720: add randomized polling intervals for link detection - dp83822: support changing the transmit amplitude voltage - support for LEDs on 88q2xxx - CAN: - canxl: support Remote Request Substitution bit access - flexcan: add S32G2/S32G3 SoC - WiFi: - remove cooked monitor support - strict mode for better AP testing - basic EPCS support - OMI RX bandwidth reduction support - batman-adv: add support for jumbo frames - WiFi drivers: - RealTek (rtw88): - support RTL8814AE and RTL8814AU - RealTek (rtw89): - switch using wiphy_lock and wiphy_work - add BB context to manipulate two PHY as preparation of MLO - improve BT-coexistence mechanism to play A2DP smoothly - Intel (iwlwifi): - add new iwlmld sub-driver for latest HW/FW combinations - MediaTek (mt76): - preparation for mt7996 Multi-Link Operation (MLO) support - Qualcomm/Atheros (ath12k): - continued work on MLO - Silabs (wfx): - Wake-on-WLAN support - Bluetooth: - add support for skb TX SND/COMPLETION timestamping - hci_core: enable buffer flow control for SCO/eSCO - coredump: log devcd dumps into the monitor - Bluetooth drivers: - intel: add support to configure TX power - nxp: handle bootloader error during cmd5 and cmd7" * tag 'net-next-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1681 commits) unix: fix up for "apparmor: add fine grained af_unix mediation" mctp: Fix incorrect tx flow invalidation condition in mctp-i2c net: usb: asix: ax88772: Increase phy_name size net: phy: Introduce PHY_ID_SIZE — minimum size for PHY ID string net: libwx: fix Tx L4 checksum net: libwx: fix Tx descriptor content for some tunnel packets atm: Fix NULL pointer dereference net: tn40xx: add pci-id of the aqr105-based Tehuti TN4010 cards net: tn40xx: prepare tn40xx driver to find phy of the TN9510 card net: tn40xx: create swnode for mdio and aqr105 phy and add to mdiobus net: phy: aquantia: add essential functions to aqr105 driver net: phy: aquantia: search for firmware-name in fwnode net: phy: aquantia: add probe function to aqr105 for firmware loading net: phy: Add swnode support to mdiobus_scan gve: add XDP DROP and PASS support for DQ gve: update XDP allocation path support RX buffer posting gve: merge packet buffer size fields gve: update GQ RX to use buf_size gve: introduce config-based allocation for XDP gve: remove xdp_xsk_done and xdp_xsk_wakeup statistics ...
Diffstat (limited to 'net/core/devmem.c')
-rw-r--r--net/core/devmem.c94
1 files changed, 59 insertions, 35 deletions
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 0e5a2c672efd..ee145a2aa41c 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -16,6 +16,7 @@
#include <net/netdev_queues.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
#include <trace/events/page_pool.h>
#include "devmem.h"
@@ -24,23 +25,30 @@
/* Device memory support */
-/* Protected by rtnl_lock() */
static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1);
+static const struct memory_provider_ops dmabuf_devmem_ops;
+
+bool net_is_devmem_iov(struct net_iov *niov)
+{
+ return niov->pp->mp_ops == &dmabuf_devmem_ops;
+}
+
static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
struct gen_pool_chunk *chunk,
void *not_used)
{
struct dmabuf_genpool_chunk_owner *owner = chunk->owner;
- kvfree(owner->niovs);
+ kvfree(owner->area.niovs);
kfree(owner);
}
static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
{
- struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov);
+ struct dmabuf_genpool_chunk_owner *owner;
+ owner = net_devmem_iov_to_chunk_owner(niov);
return owner->base_dma_addr +
((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
}
@@ -83,7 +91,7 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
offset = dma_addr - owner->base_dma_addr;
index = offset / PAGE_SIZE;
- niov = &owner->niovs[index];
+ niov = &owner->area.niovs[index];
niov->pp_magic = 0;
niov->pp = NULL;
@@ -94,7 +102,7 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
void net_devmem_free_dmabuf(struct net_iov *niov)
{
- struct net_devmem_dmabuf_binding *binding = net_iov_binding(niov);
+ struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov);
unsigned long dma_addr = net_devmem_get_dma_addr(niov);
if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr,
@@ -118,6 +126,7 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
WARN_ON(rxq->mp_params.mp_priv != binding);
rxq->mp_params.mp_priv = NULL;
+ rxq->mp_params.mp_ops = NULL;
rxq_idx = get_netdev_rx_queue_index(rxq);
@@ -154,7 +163,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
}
rxq = __netif_get_rx_queue(dev, rxq_idx);
- if (rxq->mp_params.mp_priv) {
+ if (rxq->mp_params.mp_ops) {
NL_SET_ERR_MSG(extack, "designated queue already memory provider bound");
return -EEXIST;
}
@@ -172,6 +181,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
return err;
rxq->mp_params.mp_priv = binding;
+ rxq->mp_params.mp_ops = &dmabuf_devmem_ops;
err = netdev_rx_queue_restart(dev, rxq_idx);
if (err)
@@ -181,6 +191,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
err_xa_erase:
rxq->mp_params.mp_priv = NULL;
+ rxq->mp_params.mp_ops = NULL;
xa_erase(&binding->bound_rxqs, xa_idx);
return err;
@@ -263,9 +274,9 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
goto err_free_chunks;
}
- owner->base_virtual = virtual;
+ owner->area.base_virtual = virtual;
owner->base_dma_addr = dma_addr;
- owner->num_niovs = len / PAGE_SIZE;
+ owner->area.num_niovs = len / PAGE_SIZE;
owner->binding = binding;
err = gen_pool_add_owner(binding->chunk_pool, dma_addr,
@@ -277,17 +288,17 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
goto err_free_chunks;
}
- owner->niovs = kvmalloc_array(owner->num_niovs,
- sizeof(*owner->niovs),
- GFP_KERNEL);
- if (!owner->niovs) {
+ owner->area.niovs = kvmalloc_array(owner->area.num_niovs,
+ sizeof(*owner->area.niovs),
+ GFP_KERNEL);
+ if (!owner->area.niovs) {
err = -ENOMEM;
goto err_free_chunks;
}
- for (i = 0; i < owner->num_niovs; i++) {
- niov = &owner->niovs[i];
- niov->owner = owner;
+ for (i = 0; i < owner->area.num_niovs; i++) {
+ niov = &owner->area.niovs[i];
+ niov->owner = &owner->area;
page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
net_devmem_get_dma_addr(niov));
}
@@ -315,26 +326,6 @@ err_put_dmabuf:
return ERR_PTR(err);
}
-void dev_dmabuf_uninstall(struct net_device *dev)
-{
- struct net_devmem_dmabuf_binding *binding;
- struct netdev_rx_queue *rxq;
- unsigned long xa_idx;
- unsigned int i;
-
- for (i = 0; i < dev->real_num_rx_queues; i++) {
- binding = dev->_rx[i].mp_params.mp_priv;
- if (!binding)
- continue;
-
- xa_for_each(&binding->bound_rxqs, xa_idx, rxq)
- if (rxq == &dev->_rx[i]) {
- xa_erase(&binding->bound_rxqs, xa_idx);
- break;
- }
- }
-}
-
/*** "Dmabuf devmem memory provider" ***/
int mp_dmabuf_devmem_init(struct page_pool *pool)
@@ -400,3 +391,36 @@ bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem)
/* We don't want the page pool put_page()ing our net_iovs. */
return false;
}
+
+static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp,
+ struct netdev_rx_queue *rxq)
+{
+ const struct net_devmem_dmabuf_binding *binding = mp_priv;
+ int type = rxq ? NETDEV_A_QUEUE_DMABUF : NETDEV_A_PAGE_POOL_DMABUF;
+
+ return nla_put_u32(rsp, type, binding->id);
+}
+
+static void mp_dmabuf_devmem_uninstall(void *mp_priv,
+ struct netdev_rx_queue *rxq)
+{
+ struct net_devmem_dmabuf_binding *binding = mp_priv;
+ struct netdev_rx_queue *bound_rxq;
+ unsigned long xa_idx;
+
+ xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) {
+ if (bound_rxq == rxq) {
+ xa_erase(&binding->bound_rxqs, xa_idx);
+ break;
+ }
+ }
+}
+
+static const struct memory_provider_ops dmabuf_devmem_ops = {
+ .init = mp_dmabuf_devmem_init,
+ .destroy = mp_dmabuf_devmem_destroy,
+ .alloc_netmems = mp_dmabuf_devmem_alloc_netmems,
+ .release_netmem = mp_dmabuf_devmem_release_page,
+ .nl_fill = mp_dmabuf_devmem_nl_fill,
+ .uninstall = mp_dmabuf_devmem_uninstall,
+};