aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/bpf/devmap.c
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2021-06-28 15:28:03 -0700
committerDavid S. Miller <davem@davemloft.net>2021-06-28 15:28:03 -0700
commite1289cfb634c19b5755452ba03c82aa76c0cfd7c (patch)
treefbe559fefa4b0141b31dc00bc3fbe962741a19f8 /kernel/bpf/devmap.c
parentipv6: ICMPV6: add response to ICMPV6 RFC 8335 PROBE messages (diff)
parentxdp: Move the rxq_info.mem clearing to unreg_mem_model() (diff)
downloadlinux-e1289cfb634c19b5755452ba03c82aa76c0cfd7c.tar.gz
linux-e1289cfb634c19b5755452ba03c82aa76c0cfd7c.zip
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says: ==================== pull-request: bpf-next 2021-06-28 The following pull-request contains BPF updates for your *net-next* tree. We've added 37 non-merge commits during the last 12 day(s) which contain a total of 56 files changed, 394 insertions(+), 380 deletions(-). The main changes are: 1) XDP driver RCU cleanups, from Toke Høiland-Jørgensen and Paul E. McKenney. 2) Fix bpf_skb_change_proto() IPv4/v6 GSO handling, from Maciej Żenczykowski. 3) Fix false positive kmemleak report for BPF ringbuf alloc, from Rustam Kovhaev. 4) Fix x86 JIT's extable offset calculation for PROBE_LDX NULL, from Ravi Bangoria. 5) Enable libbpf fallback probing with tracing under RHEL7, from Jonathan Edwards. 6) Clean up x86 JIT to remove unused cnt tracking from EMIT macro, from Jiri Olsa. 7) Netlink cleanups for libbpf to please Coverity, from Kumar Kartikeya Dwivedi. 8) Allow to retrieve ancestor cgroup id in tracing programs, from Namhyung Kim. 9) Fix lirc BPF program query to use user-provided prog_cnt, from Sean Young. 10) Add initial libbpf doc including generated kdoc for its API, from Grant Seltzer. 11) Make xdp_rxq_info_unreg_mem_model() more robust, from Jakub Kicinski. 12) Fix up bpfilter startup log-level to info level, from Gary Lin. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel/bpf/devmap.c')
-rw-r--r--kernel/bpf/devmap.c49
1 files changed, 21 insertions, 28 deletions
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 2a75e6c2d27d..2f6bd75cd682 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -73,7 +73,7 @@ struct bpf_dtab_netdev {
struct bpf_dtab {
struct bpf_map map;
- struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
+ struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */
struct list_head list;
/* these are only used for DEVMAP_HASH type maps */
@@ -226,7 +226,7 @@ static void dev_map_free(struct bpf_map *map)
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev;
- dev = dtab->netdev_map[i];
+ dev = rcu_dereference_raw(dtab->netdev_map[i]);
if (!dev)
continue;
@@ -259,6 +259,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
@@ -410,15 +414,9 @@ out:
trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
}
-/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
- * from the driver before returning from its napi->poll() routine. The poll()
- * routine is called either from busy_poll context or net_rx_action signaled
- * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
- * net device can be torn down. On devmap tear down we ensure the flush list
- * is empty before completing to ensure all flush operations have completed.
- * When drivers update the bpf program they may need to ensure any flush ops
- * are also complete. Using synchronize_rcu or call_rcu will suffice for this
- * because both wait for napi context to exit.
+/* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
+ * driver before returning from its napi->poll() routine. See the comment above
+ * xdp_do_flush() in filter.c.
*/
void __dev_flush(void)
{
@@ -433,9 +431,9 @@ void __dev_flush(void)
}
}
-/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
- * update happens in parallel here a dev_put won't happen until after reading
- * the ifindex.
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
*/
static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
@@ -445,12 +443,14 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
if (key >= map->max_entries)
return NULL;
- obj = READ_ONCE(dtab->netdev_map[key]);
+ obj = rcu_dereference_check(dtab->netdev_map[key],
+ rcu_read_lock_bh_held());
return obj;
}
-/* Runs under RCU-read-side, plus in softirq under NAPI protection.
- * Thus, safe percpu variable access.
+/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
+ * variable access, and map elements stick around. See comment above
+ * xdp_do_flush() in filter.c.
*/
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx, struct bpf_prog *xdp_prog)
@@ -735,14 +735,7 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
if (k >= map->max_entries)
return -EINVAL;
- /* Use call_rcu() here to ensure any rcu critical sections have
- * completed as well as any flush operations because call_rcu
- * will wait for preempt-disable region to complete, NAPI in this
- * context. And additionally, the driver tear down ensures all
- * soft irqs are complete before removing the net device in the
- * case of dev_put equals zero.
- */
- old_dev = xchg(&dtab->netdev_map[k], NULL);
+ old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL));
if (old_dev)
call_rcu(&old_dev->rcu, __dev_map_entry_free);
return 0;
@@ -851,7 +844,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
* Remembering the driver side flush operation will happen before the
* net device is removed.
*/
- old_dev = xchg(&dtab->netdev_map[i], dev);
+ old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));
if (old_dev)
call_rcu(&old_dev->rcu, __dev_map_entry_free);
@@ -1031,10 +1024,10 @@ static int dev_map_notification(struct notifier_block *notifier,
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev, *odev;
- dev = READ_ONCE(dtab->netdev_map[i]);
+ dev = rcu_dereference(dtab->netdev_map[i]);
if (!dev || netdev != dev->dev)
continue;
- odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
+ odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL));
if (dev == odev)
call_rcu(&dev->rcu,
__dev_map_entry_free);