Skip to content

Commit 673e375

Browse files
author
Alexei Starovoitov
committed
Merge branch 'Follow-up BPF helper improvements'
Daniel Borkmann says:

====================
This series addresses most of the feedback [0] that was to be followed up from the last series, that is, UAPI helper comment improvements and getting rid of the ifindex obj file hacks in the selftest by using a BPF map instead. The __sk_buff data/data_end pointer work, I'm planning to do in a later round as well as the mem*() BPF improvements we have in Cilium for libbpf.

Next, the series adds two features, i) a helper called redirect_peer() to improve latency on netns switch, and ii) to allow map in map with dynamic inner array map sizes. Selftests for each are added as well. For details, please check individual patches, thanks!

[0] https://lore.kernel.org/bpf/[email protected]/

v5 -> v6:
- Going with Andrii's suggestion to make the misconfigured verifier test more robust, and only probe on -EOPNOTSUPP (Andrii)

v4 -> v5:
- Replace cnt == -EOPNOTSUPP check with cnt < 0; I've used < 0 here as I think it's useful to keep the existing cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) for error detection (Andrii)

v3 -> v4:
- Rename new array map flag to BPF_F_INNER_MAP (Alexei)

v2 -> v3:
- Remove tab that slipped into uapi helper desc (Jakub)
- Rework map in map for array to error from map_gen_lookup (Andrii)

v1 -> v2:
- Fixed selftest comment wrt inner1/inner2 value (Yonghong)
====================

Signed-off-by: Alexei Starovoitov <[email protected]>
2 parents ac53a0d + 9f4c53c commit 673e375

File tree

17 files changed

+489
-225
lines changed

17 files changed

+489
-225
lines changed

drivers/net/veth.c

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -420,6 +420,14 @@ static int veth_select_rxq(struct net_device *dev)
420420
return smp_processor_id() % dev->real_num_rx_queues;
421421
}
422422

423+
static struct net_device *veth_peer_dev(struct net_device *dev)
424+
{
425+
struct veth_priv *priv = netdev_priv(dev);
426+
427+
/* Callers must be under RCU read side. */
428+
return rcu_dereference(priv->peer);
429+
}
430+
423431
static int veth_xdp_xmit(struct net_device *dev, int n,
424432
struct xdp_frame **frames,
425433
u32 flags, bool ndo_xmit)
@@ -1224,6 +1232,7 @@ static const struct net_device_ops veth_netdev_ops = {
12241232
.ndo_set_rx_headroom = veth_set_rx_headroom,
12251233
.ndo_bpf = veth_xdp,
12261234
.ndo_xdp_xmit = veth_ndo_xdp_xmit,
1235+
.ndo_get_peer_dev = veth_peer_dev,
12271236
};
12281237

12291238
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \

include/linux/bpf.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,7 @@ struct bpf_map_ops {
8282
void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
8383
int fd);
8484
void (*map_fd_put_ptr)(void *ptr);
85-
u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
85+
int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
8686
u32 (*map_fd_sys_lookup_elem)(void *ptr);
8787
void (*map_seq_show_elem)(struct bpf_map *map, void *key,
8888
struct seq_file *m);

include/linux/netdevice.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1277,6 +1277,9 @@ struct netdev_net_notifier {
12771277
* int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p,
12781278
* int cmd);
12791279
* Add, change, delete or get information on an IPv4 tunnel.
1280+
* struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
1281+
* If a device is paired with a peer device, return the peer instance.
1282+
* The caller must be under RCU read context.
12801283
*/
12811284
struct net_device_ops {
12821285
int (*ndo_init)(struct net_device *dev);
@@ -1484,6 +1487,7 @@ struct net_device_ops {
14841487
struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev);
14851488
int (*ndo_tunnel_ctl)(struct net_device *dev,
14861489
struct ip_tunnel_parm *p, int cmd);
1490+
struct net_device * (*ndo_get_peer_dev)(struct net_device *dev);
14871491
};
14881492

14891493
/**

include/uapi/linux/bpf.h

Lines changed: 27 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -435,6 +435,9 @@ enum {
435435

436436
/* Share perf_event among processes */
437437
BPF_F_PRESERVE_ELEMS = (1U << 11),
438+
439+
/* Create a map that is suitable to be an inner map with dynamic max entries */
440+
BPF_F_INNER_MAP = (1U << 12),
438441
};
439442

440443
/* Flags for BPF_PROG_QUERY. */
@@ -3679,10 +3682,14 @@ union bpf_attr {
36793682
* Redirect the packet to another net device of index *ifindex*
36803683
* and fill in L2 addresses from neighboring subsystem. This helper
36813684
* is somewhat similar to **bpf_redirect**\ (), except that it
3682-
* fills in e.g. MAC addresses based on the L3 information from
3683-
* the packet. This helper is supported for IPv4 and IPv6 protocols.
3685+
* populates L2 addresses as well, meaning, internally, the helper
3686+
* performs a FIB lookup based on the skb's networking header to
3687+
* get the address of the next hop and then relies on the neighbor
3688+
* lookup for the L2 address of the nexthop.
3689+
*
36843690
* The *flags* argument is reserved and must be 0. The helper is
3685-
* currently only supported for tc BPF program types.
3691+
* currently only supported for tc BPF program types, and enabled
3692+
* for IPv4 and IPv6 protocols.
36863693
* Return
36873694
* The helper returns **TC_ACT_REDIRECT** on success or
36883695
* **TC_ACT_SHOT** on error.
@@ -3715,6 +3722,22 @@ union bpf_attr {
37153722
* never return NULL.
37163723
* Return
37173724
* A pointer pointing to the kernel percpu variable on this cpu.
3725+
*
3726+
* long bpf_redirect_peer(u32 ifindex, u64 flags)
3727+
* Description
3728+
* Redirect the packet to another net device of index *ifindex*.
3729+
* This helper is somewhat similar to **bpf_redirect**\ (), except
3730+
* that the redirection happens to the *ifindex*' peer device and
3731+
* the netns switch takes place from ingress to ingress without
3732+
* going through the CPU's backlog queue.
3733+
*
3734+
* The *flags* argument is reserved and must be 0. The helper is
3735+
* currently only supported for tc BPF program types at the ingress
3736+
* hook and for veth device types. The peer device must reside in a
3737+
* different network namespace.
3738+
* Return
3739+
* The helper returns **TC_ACT_REDIRECT** on success or
3740+
* **TC_ACT_SHOT** on error.
37183741
*/
37193742
#define __BPF_FUNC_MAPPER(FN) \
37203743
FN(unspec), \
@@ -3872,6 +3895,7 @@ union bpf_attr {
38723895
FN(redirect_neigh), \
38733896
FN(bpf_per_cpu_ptr), \
38743897
FN(bpf_this_cpu_ptr), \
3898+
FN(redirect_peer), \
38753899
/* */
38763900

38773901
/* integer value in 'imm' field of BPF_CALL instruction selects which helper

kernel/bpf/arraymap.c

Lines changed: 11 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616

1717
#define ARRAY_CREATE_FLAG_MASK \
1818
(BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \
19-
BPF_F_PRESERVE_ELEMS)
19+
BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP)
2020

2121
static void bpf_array_free_percpu(struct bpf_array *array)
2222
{
@@ -62,7 +62,7 @@ int array_map_alloc_check(union bpf_attr *attr)
6262
return -EINVAL;
6363

6464
if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
65-
attr->map_flags & BPF_F_MMAPABLE)
65+
attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP))
6666
return -EINVAL;
6767

6868
if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
@@ -214,7 +214,7 @@ static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
214214
}
215215

216216
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
217-
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
217+
static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
218218
{
219219
struct bpf_array *array = container_of(map, struct bpf_array, map);
220220
struct bpf_insn *insn = insn_buf;
@@ -223,6 +223,9 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
223223
const int map_ptr = BPF_REG_1;
224224
const int index = BPF_REG_2;
225225

226+
if (map->map_flags & BPF_F_INNER_MAP)
227+
return -EOPNOTSUPP;
228+
226229
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
227230
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
228231
if (!map->bypass_spec_v1) {
@@ -496,8 +499,10 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
496499
static bool array_map_meta_equal(const struct bpf_map *meta0,
497500
const struct bpf_map *meta1)
498501
{
499-
return meta0->max_entries == meta1->max_entries &&
500-
bpf_map_meta_equal(meta0, meta1);
502+
if (!bpf_map_meta_equal(meta0, meta1))
503+
return false;
504+
return meta0->map_flags & BPF_F_INNER_MAP ? true :
505+
meta0->max_entries == meta1->max_entries;
501506
}
502507

503508
struct bpf_iter_seq_array_map_info {
@@ -1251,7 +1256,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
12511256
return READ_ONCE(*inner_map);
12521257
}
12531258

1254-
static u32 array_of_map_gen_lookup(struct bpf_map *map,
1259+
static int array_of_map_gen_lookup(struct bpf_map *map,
12551260
struct bpf_insn *insn_buf)
12561261
{
12571262
struct bpf_array *array = container_of(map, struct bpf_array, map);

kernel/bpf/hashtab.c

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -612,7 +612,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
612612
* bpf_prog
613613
* __htab_map_lookup_elem
614614
*/
615-
static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
615+
static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
616616
{
617617
struct bpf_insn *insn = insn_buf;
618618
const int ret = BPF_REG_0;
@@ -651,7 +651,7 @@ static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
651651
return __htab_lru_map_lookup_elem(map, key, false);
652652
}
653653

654-
static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
654+
static int htab_lru_map_gen_lookup(struct bpf_map *map,
655655
struct bpf_insn *insn_buf)
656656
{
657657
struct bpf_insn *insn = insn_buf;
@@ -2070,7 +2070,7 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
20702070
return READ_ONCE(*inner_map);
20712071
}
20722072

2073-
static u32 htab_of_map_gen_lookup(struct bpf_map *map,
2073+
static int htab_of_map_gen_lookup(struct bpf_map *map,
20742074
struct bpf_insn *insn_buf)
20752075
{
20762076
struct bpf_insn *insn = insn_buf;

kernel/bpf/verifier.c

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -11049,7 +11049,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
1104911049
if (insn->imm == BPF_FUNC_map_lookup_elem &&
1105011050
ops->map_gen_lookup) {
1105111051
cnt = ops->map_gen_lookup(map_ptr, insn_buf);
11052-
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
11052+
if (cnt == -EOPNOTSUPP)
11053+
goto patch_map_ops_generic;
11054+
if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) {
1105311055
verbose(env, "bpf verifier is misconfigured\n");
1105411056
return -EINVAL;
1105511057
}
@@ -11079,7 +11081,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
1107911081
(int (*)(struct bpf_map *map, void *value))NULL));
1108011082
BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
1108111083
(int (*)(struct bpf_map *map, void *value))NULL));
11082-
11084+
patch_map_ops_generic:
1108311085
switch (insn->imm) {
1108411086
case BPF_FUNC_map_lookup_elem:
1108511087
insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -

net/core/dev.c

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4930,7 +4930,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
49304930

49314931
static inline struct sk_buff *
49324932
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4933-
struct net_device *orig_dev)
4933+
struct net_device *orig_dev, bool *another)
49344934
{
49354935
#ifdef CONFIG_NET_CLS_ACT
49364936
struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
@@ -4974,7 +4974,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
49744974
* redirecting to another netdev
49754975
*/
49764976
__skb_push(skb, skb->mac_len);
4977-
skb_do_redirect(skb);
4977+
if (skb_do_redirect(skb) == -EAGAIN) {
4978+
__skb_pull(skb, skb->mac_len);
4979+
*another = true;
4980+
break;
4981+
}
49784982
return NULL;
49794983
case TC_ACT_CONSUMED:
49804984
return NULL;
@@ -5163,7 +5167,12 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
51635167
skip_taps:
51645168
#ifdef CONFIG_NET_INGRESS
51655169
if (static_branch_unlikely(&ingress_needed_key)) {
5166-
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
5170+
bool another = false;
5171+
5172+
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
5173+
&another);
5174+
if (another)
5175+
goto another_round;
51675176
if (!skb)
51685177
goto out;
51695178

net/core/filter.c

Lines changed: 47 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2380,8 +2380,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev)
23802380

23812381
/* Internal, non-exposed redirect flags. */
23822382
enum {
2383-
BPF_F_NEIGH = (1ULL << 1),
2384-
#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH)
2383+
BPF_F_NEIGH = (1ULL << 1),
2384+
BPF_F_PEER = (1ULL << 2),
2385+
#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER)
23852386
};
23862387

23872388
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
@@ -2430,19 +2431,35 @@ EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
24302431
int skb_do_redirect(struct sk_buff *skb)
24312432
{
24322433
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
2434+
struct net *net = dev_net(skb->dev);
24332435
struct net_device *dev;
24342436
u32 flags = ri->flags;
24352437

2436-
dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
2438+
dev = dev_get_by_index_rcu(net, ri->tgt_index);
24372439
ri->tgt_index = 0;
2438-
if (unlikely(!dev)) {
2439-
kfree_skb(skb);
2440-
return -EINVAL;
2440+
ri->flags = 0;
2441+
if (unlikely(!dev))
2442+
goto out_drop;
2443+
if (flags & BPF_F_PEER) {
2444+
const struct net_device_ops *ops = dev->netdev_ops;
2445+
2446+
if (unlikely(!ops->ndo_get_peer_dev ||
2447+
!skb_at_tc_ingress(skb)))
2448+
goto out_drop;
2449+
dev = ops->ndo_get_peer_dev(dev);
2450+
if (unlikely(!dev ||
2451+
!is_skb_forwardable(dev, skb) ||
2452+
net_eq(net, dev_net(dev))))
2453+
goto out_drop;
2454+
skb->dev = dev;
2455+
return -EAGAIN;
24412456
}
2442-
24432457
return flags & BPF_F_NEIGH ?
24442458
__bpf_redirect_neigh(skb, dev) :
24452459
__bpf_redirect(skb, dev, flags);
2460+
out_drop:
2461+
kfree_skb(skb);
2462+
return -EINVAL;
24462463
}
24472464

24482465
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
@@ -2466,6 +2483,27 @@ static const struct bpf_func_proto bpf_redirect_proto = {
24662483
.arg2_type = ARG_ANYTHING,
24672484
};
24682485

2486+
BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
2487+
{
2488+
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
2489+
2490+
if (unlikely(flags))
2491+
return TC_ACT_SHOT;
2492+
2493+
ri->flags = BPF_F_PEER;
2494+
ri->tgt_index = ifindex;
2495+
2496+
return TC_ACT_REDIRECT;
2497+
}
2498+
2499+
static const struct bpf_func_proto bpf_redirect_peer_proto = {
2500+
.func = bpf_redirect_peer,
2501+
.gpl_only = false,
2502+
.ret_type = RET_INTEGER,
2503+
.arg1_type = ARG_ANYTHING,
2504+
.arg2_type = ARG_ANYTHING,
2505+
};
2506+
24692507
BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags)
24702508
{
24712509
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
@@ -7053,6 +7091,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
70537091
return &bpf_redirect_proto;
70547092
case BPF_FUNC_redirect_neigh:
70557093
return &bpf_redirect_neigh_proto;
7094+
case BPF_FUNC_redirect_peer:
7095+
return &bpf_redirect_peer_proto;
70567096
case BPF_FUNC_get_route_realm:
70577097
return &bpf_get_route_realm_proto;
70587098
case BPF_FUNC_get_hash_recalc:

net/xdp/xskmap.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -132,7 +132,7 @@ static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
132132
return 0;
133133
}
134134

135-
static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
135+
static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
136136
{
137137
const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
138138
struct bpf_insn *insn = insn_buf;

0 commit comments

Comments
 (0)