Topics

[RFC PATCH 1/3] bpf: add bpf queue map

Mauricio Vasquez
 

Bpf queue implements LIFO/FIFO data containers for ebpf programs.

It allows pushing an element to the queue by using the update operation
and popping an element from the queue by using the lookup operation.

A use case for this is to keep track of a pool of elements, like
network ports in a SNAT.

Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@...>
---
include/linux/bpf_types.h | 1
include/uapi/linux/bpf.h | 5 +
kernel/bpf/Makefile | 2
kernel/bpf/queuemap.c | 204 +++++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/syscall.c | 61 ++++++++++---
kernel/bpf/verifier.c | 10 ++
6 files changed, 265 insertions(+), 18 deletions(-)
create mode 100644 kernel/bpf/queuemap.c

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index c5700c2d5549..6c7a62f3fe43 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -58,3 +58,4 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
#endif
#endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0ebaaf7f3568..2c171c40eb45 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -120,6 +120,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_CPUMAP,
BPF_MAP_TYPE_XSKMAP,
BPF_MAP_TYPE_SOCKHASH,
+ BPF_MAP_TYPE_QUEUE,
};

enum bpf_prog_type {
@@ -255,6 +256,10 @@ enum bpf_attach_type {
/* Flag for stack_map, store build_id+offset instead of pointer */
#define BPF_F_STACK_BUILD_ID (1U << 5)

+/* Flags for queue_map, type of queue */
+#define BPF_F_QUEUE_FIFO (1U << 16)
+#define BPF_F_QUEUE_LIFO (2U << 16)
+
enum bpf_stack_build_id_status {
/* user space need an empty entry to identify end of a trace */
BPF_STACK_BUILD_ID_EMPTY = 0,
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f27f5496d6fe..30f02ef66635 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
obj-y := core.o

obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o queuemap.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
ifeq ($(CONFIG_NET),y)
diff --git a/kernel/bpf/queuemap.c b/kernel/bpf/queuemap.c
new file mode 100644
index 000000000000..af179562c9b7
--- /dev/null
+++ b/kernel/bpf/queuemap.c
@@ -0,0 +1,204 @@
+/* Copyright (c) 2018 Politecnico di Torino
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+
+#define QUEUE_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
+ BPF_F_QUEUE_FIFO | BPF_F_QUEUE_LIFO)
+
+enum queue_type {
+ QUEUE_FIFO = (BPF_F_QUEUE_FIFO >> 16),
+ QUEUE_LIFO = (BPF_F_QUEUE_LIFO >> 16),
+};
+
+struct bpf_queue {
+ struct bpf_map map;
+ struct list_head head;
+ enum queue_type type;
+ raw_spinlock_t lock;
+ atomic_t count;
+};
+
+struct queue_node {
+ struct list_head list;
+ struct rcu_head rcu;
+ char element[0] __aligned(8);
+};
+
+/* Called from syscall */
+static int queue_map_alloc_check(union bpf_attr *attr)
+{
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 0 ||
+ attr->value_size == 0 || attr->map_flags & ~QUEUE_CREATE_FLAG_MASK)
+ return -EINVAL;
+
+ if ((attr->map_flags >> 16) != QUEUE_FIFO &&
+ (attr->map_flags >> 16) != QUEUE_LIFO) {
+ return -EINVAL;
+ }
+
+ if (attr->value_size > KMALLOC_MAX_SIZE)
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements.
+ */
+ return -E2BIG;
+
+ return 0;
+}
+
+static struct bpf_map *queue_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_queue *queue;
+
+ queue = kzalloc(sizeof(*queue), GFP_USER);
+ if (!queue)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&queue->map, attr);
+ INIT_LIST_HEAD(&queue->head);
+
+ raw_spin_lock_init(&queue->lock);
+
+ queue->type = attr->map_flags >> 16;
+
+ return &queue->map;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void queue_map_free(struct bpf_map *map)
+{
+ struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
+ struct queue_node *l;
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete
+ */
+ synchronize_rcu();
+
+ /* some of free_queue_elem() callbacks for elements of this map may
+ * not have executed. Wait for them.
+ */
+ rcu_barrier();
+
+ list_for_each_entry_rcu(l, &queue->head, list) {
+ list_del_rcu(&l->list);
+ kfree(l);
+ }
+
+ kfree(queue);
+}
+
+static void queue_elem_free_rcu(struct rcu_head *head)
+{
+ struct queue_node *l = container_of(head, struct queue_node, rcu);
+
+ /* must increment bpf_prog_active to avoid kprobe+bpf triggering while
+ * we're calling kfree, otherwise deadlock is possible if kprobes
+ * are placed somewhere inside of slub
+ */
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ kfree(l);
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+}
+
+/* Called from syscall or from eBPF program */
+static void *queue_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
+ unsigned long flags;
+ struct queue_node *node;
+
+ raw_spin_lock_irqsave(&queue->lock, flags);
+
+ node = list_first_or_null_rcu(&queue->head, struct queue_node, list);
+ if (!node) {
+ raw_spin_unlock_irqrestore(&queue->lock, flags);
+ return NULL;
+ }
+
+ atomic_dec(&queue->count);
+ list_del_rcu(&node->list);
+ call_rcu(&node->rcu, queue_elem_free_rcu);
+
+ raw_spin_unlock_irqrestore(&queue->lock, flags);
+
+ return &node->element;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
+ unsigned long flags;
+
+ struct queue_node *new;
+
+ if (atomic_inc_return(&queue->count) > queue->map.max_entries) {
+ atomic_dec(&queue->count);
+ return -E2BIG;
+ }
+
+ new = kmalloc_node(sizeof(*new) + round_up(queue->map.value_size, 8),
+ GFP_ATOMIC | __GFP_NOWARN, queue->map.numa_node);
+ if (!new) {
+ atomic_dec(&queue->count);
+ return -ENOMEM;
+ }
+ memcpy(new->element, value, queue->map.value_size);
+
+ raw_spin_lock_irqsave(&queue->lock, flags);
+ switch (queue->type) {
+ case QUEUE_FIFO:
+ list_add_tail_rcu(&new->list, &queue->head);
+ break;
+
+ case QUEUE_LIFO:
+ list_add_rcu(&new->list, &queue->head);
+ break;
+ }
+
+ raw_spin_unlock_irqrestore(&queue->lock, flags);
+
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+/* Called from syscall */
+static int queue_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -EINVAL;
+}
+
+const struct bpf_map_ops queue_map_ops = {
+ .map_alloc_check = queue_map_alloc_check,
+ .map_alloc = queue_map_alloc,
+ .map_free = queue_map_free,
+ .map_lookup_elem = queue_map_lookup_elem,
+ .map_update_elem = queue_map_update_elem,
+ .map_delete_elem = queue_map_delete_elem,
+ .map_get_next_key = queue_map_get_next_key,
+};
+
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a31a1ba0f8ea..7e9a11d69eef 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -622,11 +622,19 @@ static int map_lookup_elem(union bpf_attr *attr)
err = -EPERM;
goto err_put;
}
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ if (ukey) {
+ err = -EINVAL;
+ goto err_put;
+ }

- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
+ key = NULL;
}

if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -709,10 +717,19 @@ static int map_update_elem(union bpf_attr *attr)
goto err_put;
}

- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ if (ukey) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ key = NULL;
}

if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -803,10 +820,19 @@ static int map_delete_elem(union bpf_attr *attr)
goto err_put;
}

- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ if (ukey) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ key = NULL;
}

if (bpf_map_is_dev_bound(map)) {
@@ -855,9 +881,14 @@ static int map_get_next_key(union bpf_attr *attr)
}

if (ukey) {
- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ err = -EINVAL;
goto err_put;
}
} else {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 25e47c195874..b9e730090822 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1980,7 +1980,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
- type != expected_type)
+ type != expected_type && type != SCALAR_VALUE)
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -2021,6 +2021,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
meta->map_ptr = reg->map_ptr;
} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ bool zero_size_allowed = false;
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
* stack limits and initialized
@@ -2034,8 +2035,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
+
+ if (meta->map_ptr->map_type == BPF_MAP_TYPE_QUEUE)
+ zero_size_allowed = true;
+
err = check_helper_mem_access(env, regno,
- meta->map_ptr->key_size, false,
+ meta->map_ptr->key_size,
+ zero_size_allowed,
NULL);
} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
/* bpf_map_xxx(..., map_ptr, ..., value) call:

Yonghong Song
 

On Tue, Jul 31, 2018 at 1:53 PM, Mauricio Vasquez
<mauricio.vasquez@...> wrote:
Bpf queue implements LIFO/FIFO data containers for ebpf programs.

It allows pushing an element to the queue by using the update operation
and popping an element from the queue by using the lookup operation.

A use case for this is to keep track of a pool of elements, like
network ports in a SNAT.

Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@...>
---
include/linux/bpf_types.h | 1
include/uapi/linux/bpf.h | 5 +
kernel/bpf/Makefile | 2
kernel/bpf/queuemap.c | 204 +++++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/syscall.c | 61 ++++++++++---
kernel/bpf/verifier.c | 10 ++
6 files changed, 265 insertions(+), 18 deletions(-)
create mode 100644 kernel/bpf/queuemap.c

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index c5700c2d5549..6c7a62f3fe43 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -58,3 +58,4 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
#endif
#endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0ebaaf7f3568..2c171c40eb45 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -120,6 +120,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_CPUMAP,
BPF_MAP_TYPE_XSKMAP,
BPF_MAP_TYPE_SOCKHASH,
+ BPF_MAP_TYPE_QUEUE,
};

enum bpf_prog_type {
@@ -255,6 +256,10 @@ enum bpf_attach_type {
/* Flag for stack_map, store build_id+offset instead of pointer */
#define BPF_F_STACK_BUILD_ID (1U << 5)

+/* Flags for queue_map, type of queue */
+#define BPF_F_QUEUE_FIFO (1U << 16)
+#define BPF_F_QUEUE_LIFO (2U << 16)
+
enum bpf_stack_build_id_status {
/* user space need an empty entry to identify end of a trace */
BPF_STACK_BUILD_ID_EMPTY = 0,
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f27f5496d6fe..30f02ef66635 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
obj-y := core.o

obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o queuemap.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
ifeq ($(CONFIG_NET),y)
diff --git a/kernel/bpf/queuemap.c b/kernel/bpf/queuemap.c
new file mode 100644
index 000000000000..af179562c9b7
--- /dev/null
+++ b/kernel/bpf/queuemap.c
@@ -0,0 +1,204 @@
+/* Copyright (c) 2018 Politecnico di Torino
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
It is totally up to you, but you could replace the above licensing portion from
"This program" to "more details" with
/* SPDX-License-Identifier: GPL-2.0 */

+#include <linux/bpf.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+
+#define QUEUE_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
+ BPF_F_QUEUE_FIFO | BPF_F_QUEUE_LIFO)
+
+enum queue_type {
+ QUEUE_FIFO = (BPF_F_QUEUE_FIFO >> 16),
+ QUEUE_LIFO = (BPF_F_QUEUE_LIFO >> 16),
+};
+
+struct bpf_queue {
+ struct bpf_map map;
+ struct list_head head;
+ enum queue_type type;
+ raw_spinlock_t lock;
+ atomic_t count;
+};
+
+struct queue_node {
+ struct list_head list;
+ struct rcu_head rcu;
+ char element[0] __aligned(8);
+};
+
+/* Called from syscall */
+static int queue_map_alloc_check(union bpf_attr *attr)
+{
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 0 ||
+ attr->value_size == 0 || attr->map_flags & ~QUEUE_CREATE_FLAG_MASK)
+ return -EINVAL;
+
+ if ((attr->map_flags >> 16) != QUEUE_FIFO &&
+ (attr->map_flags >> 16) != QUEUE_LIFO) {
+ return -EINVAL;
+ }
+
+ if (attr->value_size > KMALLOC_MAX_SIZE)
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements.
+ */
+ return -E2BIG;
+
+ return 0;
+}
+
+static struct bpf_map *queue_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_queue *queue;
+
+ queue = kzalloc(sizeof(*queue), GFP_USER);
+ if (!queue)
+ return ERR_PTR(-ENOMEM);
Similar to other maps, you will need to calculate the cost
of the map based on maximum number of entries and
precharge the locked memory. If precharging failed, map
creation should fail.

+
+ bpf_map_init_from_attr(&queue->map, attr);
+ INIT_LIST_HEAD(&queue->head);
+
+ raw_spin_lock_init(&queue->lock);
+
+ queue->type = attr->map_flags >> 16;
+
+ return &queue->map;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void queue_map_free(struct bpf_map *map)
+{
+ struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
+ struct queue_node *l;
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete
+ */
+ synchronize_rcu();
+
+ /* some of free_queue_elem() callbacks for elements of this map may
+ * not have executed. Wait for them.
+ */
+ rcu_barrier();
+
+ list_for_each_entry_rcu(l, &queue->head, list) {
+ list_del_rcu(&l->list);
+ kfree(l);
+ }
+
+ kfree(queue);
+}
+
+static void queue_elem_free_rcu(struct rcu_head *head)
+{
+ struct queue_node *l = container_of(head, struct queue_node, rcu);
+
+ /* must increment bpf_prog_active to avoid kprobe+bpf triggering while
+ * we're calling kfree, otherwise deadlock is possible if kprobes
+ * are placed somewhere inside of slub
+ */
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ kfree(l);
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+}
+
+/* Called from syscall or from eBPF program */
+static void *queue_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
+ unsigned long flags;
+ struct queue_node *node;
+
+ raw_spin_lock_irqsave(&queue->lock, flags);
+
+ node = list_first_or_null_rcu(&queue->head, struct queue_node, list);
+ if (!node) {
+ raw_spin_unlock_irqrestore(&queue->lock, flags);
+ return NULL;
+ }
+
+ atomic_dec(&queue->count);
+ list_del_rcu(&node->list);
+ call_rcu(&node->rcu, queue_elem_free_rcu);
+
+ raw_spin_unlock_irqrestore(&queue->lock, flags);
+
+ return &node->element;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
+ unsigned long flags;
+
+ struct queue_node *new;
+
+ if (atomic_inc_return(&queue->count) > queue->map.max_entries) {
+ atomic_dec(&queue->count);
+ return -E2BIG;
+ }
+
+ new = kmalloc_node(sizeof(*new) + round_up(queue->map.value_size, 8),
+ GFP_ATOMIC | __GFP_NOWARN, queue->map.numa_node);
+ if (!new) {
+ atomic_dec(&queue->count);
+ return -ENOMEM;
+ }
+ memcpy(new->element, value, queue->map.value_size);
+
+ raw_spin_lock_irqsave(&queue->lock, flags);
+ switch (queue->type) {
+ case QUEUE_FIFO:
+ list_add_tail_rcu(&new->list, &queue->head);
+ break;
+
+ case QUEUE_LIFO:
+ list_add_rcu(&new->list, &queue->head);
+ break;
+ }
+
+ raw_spin_unlock_irqrestore(&queue->lock, flags);
+
+ return 0;
+}
For multiple cpu system (e.g., SNAT), the above lookup/update
performance could be really bad? Could you do some
estimation?

Based on performance evaluation, here every time, we tried
to call kmalloc_node to get a node, maybe we can use
a free-list (common or per cpu, see hashmap) to speed it up?

+
+/* Called from syscall or from eBPF program */
+static int queue_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+/* Called from syscall */
+static int queue_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -EINVAL;
+}
+
+const struct bpf_map_ops queue_map_ops = {
+ .map_alloc_check = queue_map_alloc_check,
+ .map_alloc = queue_map_alloc,
+ .map_free = queue_map_free,
+ .map_lookup_elem = queue_map_lookup_elem,
+ .map_update_elem = queue_map_update_elem,
+ .map_delete_elem = queue_map_delete_elem,
+ .map_get_next_key = queue_map_get_next_key,
+};
+
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a31a1ba0f8ea..7e9a11d69eef 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -622,11 +622,19 @@ static int map_lookup_elem(union bpf_attr *attr)
err = -EPERM;
goto err_put;
}
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ if (ukey) {
+ err = -EINVAL;
+ goto err_put;
+ }

- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
+ key = NULL;
}

if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -709,10 +717,19 @@ static int map_update_elem(union bpf_attr *attr)
goto err_put;
}

- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ if (ukey) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ key = NULL;
}

if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -803,10 +820,19 @@ static int map_delete_elem(union bpf_attr *attr)
goto err_put;
}

- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ if (ukey) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ key = NULL;
}

if (bpf_map_is_dev_bound(map)) {
@@ -855,9 +881,14 @@ static int map_get_next_key(union bpf_attr *attr)
}

if (ukey) {
- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ err = -EINVAL;
goto err_put;
}
} else {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 25e47c195874..b9e730090822 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1980,7 +1980,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
- type != expected_type)
+ type != expected_type && type != SCALAR_VALUE)
goto err_type;
Why do you have "type != SCALAR_VALUE" here?

} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -2021,6 +2021,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
meta->map_ptr = reg->map_ptr;
} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ bool zero_size_allowed = false;
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
* stack limits and initialized
@@ -2034,8 +2035,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
+
+ if (meta->map_ptr->map_type == BPF_MAP_TYPE_QUEUE)
+ zero_size_allowed = true;
+
err = check_helper_mem_access(env, regno,
- meta->map_ptr->key_size, false,
+ meta->map_ptr->key_size,
+ zero_size_allowed,
NULL);
} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
/* bpf_map_xxx(..., map_ptr, ..., value) call:



Mauricio Vasquez
 

On 08/01/2018 12:34 AM, Y Song wrote:
On Tue, Jul 31, 2018 at 1:53 PM, Mauricio Vasquez
<mauricio.vasquez@...> wrote:
Bpf queue implements LIFO/FIFO data containers for ebpf programs.

It allows pushing an element to the queue by using the update operation
and popping an element from the queue by using the lookup operation.

A use case for this is to keep track of a pool of elements, like
network ports in a SNAT.

Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@...>
---
include/linux/bpf_types.h | 1
include/uapi/linux/bpf.h | 5 +
kernel/bpf/Makefile | 2
kernel/bpf/queuemap.c | 204 +++++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/syscall.c | 61 ++++++++++---
kernel/bpf/verifier.c | 10 ++
6 files changed, 265 insertions(+), 18 deletions(-)
create mode 100644 kernel/bpf/queuemap.c

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index c5700c2d5549..6c7a62f3fe43 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -58,3 +58,4 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
#endif
#endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0ebaaf7f3568..2c171c40eb45 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -120,6 +120,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_CPUMAP,
BPF_MAP_TYPE_XSKMAP,
BPF_MAP_TYPE_SOCKHASH,
+ BPF_MAP_TYPE_QUEUE,
};

enum bpf_prog_type {
@@ -255,6 +256,10 @@ enum bpf_attach_type {
/* Flag for stack_map, store build_id+offset instead of pointer */
#define BPF_F_STACK_BUILD_ID (1U << 5)

+/* Flags for queue_map, type of queue */
+#define BPF_F_QUEUE_FIFO (1U << 16)
+#define BPF_F_QUEUE_LIFO (2U << 16)
+
enum bpf_stack_build_id_status {
/* user space need an empty entry to identify end of a trace */
BPF_STACK_BUILD_ID_EMPTY = 0,
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f27f5496d6fe..30f02ef66635 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
obj-y := core.o

obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o queuemap.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
ifeq ($(CONFIG_NET),y)
diff --git a/kernel/bpf/queuemap.c b/kernel/bpf/queuemap.c
new file mode 100644
index 000000000000..af179562c9b7
--- /dev/null
+++ b/kernel/bpf/queuemap.c
@@ -0,0 +1,204 @@
+/* Copyright (c) 2018 Politecnico di Torino
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
It is totally up to you, but you could replace the above licensing portion from
"This program" to "more details" with
/* SPDX-License-Identifier: GPL-2.0 */

I have no preference either way; I will do it that way.
+#include <linux/bpf.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+
+#define QUEUE_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
+ BPF_F_QUEUE_FIFO | BPF_F_QUEUE_LIFO)
+
+enum queue_type {
+ QUEUE_FIFO = (BPF_F_QUEUE_FIFO >> 16),
+ QUEUE_LIFO = (BPF_F_QUEUE_LIFO >> 16),
+};
+
+struct bpf_queue {
+ struct bpf_map map;
+ struct list_head head;
+ enum queue_type type;
+ raw_spinlock_t lock;
+ atomic_t count;
+};
+
+struct queue_node {
+ struct list_head list;
+ struct rcu_head rcu;
+ char element[0] __aligned(8);
+};
+
+/* Called from syscall */
+static int queue_map_alloc_check(union bpf_attr *attr)
+{
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 0 ||
+ attr->value_size == 0 || attr->map_flags & ~QUEUE_CREATE_FLAG_MASK)
+ return -EINVAL;
+
+ if ((attr->map_flags >> 16) != QUEUE_FIFO &&
+ (attr->map_flags >> 16) != QUEUE_LIFO) {
+ return -EINVAL;
+ }
+
+ if (attr->value_size > KMALLOC_MAX_SIZE)
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements.
+ */
+ return -E2BIG;
+
+ return 0;
+}
+
+static struct bpf_map *queue_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_queue *queue;
+
+ queue = kzalloc(sizeof(*queue), GFP_USER);
+ if (!queue)
+ return ERR_PTR(-ENOMEM);
Similar to other maps, you will need to calculate the cost
of the map based on maximum number of entries and
precharge the locked memory. If precharging failed, map
creation should fail.

I missed that, will do.
+
+ bpf_map_init_from_attr(&queue->map, attr);
+ INIT_LIST_HEAD(&queue->head);
+
+ raw_spin_lock_init(&queue->lock);
+
+ queue->type = attr->map_flags >> 16;
+
+ return &queue->map;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void queue_map_free(struct bpf_map *map)
+{
+ struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
+ struct queue_node *l;
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete
+ */
+ synchronize_rcu();
+
+ /* some of free_queue_elem() callbacks for elements of this map may
+ * not have executed. Wait for them.
+ */
+ rcu_barrier();
+
+ list_for_each_entry_rcu(l, &queue->head, list) {
+ list_del_rcu(&l->list);
+ kfree(l);
+ }
+
+ kfree(queue);
+}
+
+static void queue_elem_free_rcu(struct rcu_head *head)
+{
+ struct queue_node *l = container_of(head, struct queue_node, rcu);
+
+ /* must increment bpf_prog_active to avoid kprobe+bpf triggering while
+ * we're calling kfree, otherwise deadlock is possible if kprobes
+ * are placed somewhere inside of slub
+ */
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ kfree(l);
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+}
+
+/* Called from syscall or from eBPF program */
+static void *queue_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
+ unsigned long flags;
+ struct queue_node *node;
+
+ raw_spin_lock_irqsave(&queue->lock, flags);
+
+ node = list_first_or_null_rcu(&queue->head, struct queue_node, list);
+ if (!node) {
+ raw_spin_unlock_irqrestore(&queue->lock, flags);
+ return NULL;
+ }
+
+ atomic_dec(&queue->count);
+ list_del_rcu(&node->list);
+ call_rcu(&node->rcu, queue_elem_free_rcu);
+
+ raw_spin_unlock_irqrestore(&queue->lock, flags);
+
+ return &node->element;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
+ unsigned long flags;
+
+ struct queue_node *new;
+
+ if (atomic_inc_return(&queue->count) > queue->map.max_entries) {
+ atomic_dec(&queue->count);
+ return -E2BIG;
+ }
+
+ new = kmalloc_node(sizeof(*new) + round_up(queue->map.value_size, 8),
+ GFP_ATOMIC | __GFP_NOWARN, queue->map.numa_node);
+ if (!new) {
+ atomic_dec(&queue->count);
+ return -ENOMEM;
+ }
+ memcpy(new->element, value, queue->map.value_size);
+
+ raw_spin_lock_irqsave(&queue->lock, flags);
+ switch (queue->type) {
+ case QUEUE_FIFO:
+ list_add_tail_rcu(&new->list, &queue->head);
+ break;
+
+ case QUEUE_LIFO:
+ list_add_rcu(&new->list, &queue->head);
+ break;
+ }
+
+ raw_spin_unlock_irqrestore(&queue->lock, flags);
+
+ return 0;
+}
For multiple cpu system (e.g., SNAT), the above lookup/update
performance could be really bad? Could you do some
estimation?

Based on performance evaluation, here every time, we tried
to call kmalloc_node to get a node, maybe we can use
a free-list (common or per cpu, see hashmap) to speed it up?
I realized that I totally missed the support for preallocating elements.
In v2 I implemented a per-CPU free-list-based scheme, very close to the hashmap.
+
+/* Called from syscall or from eBPF program */
+static int queue_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+/* Called from syscall */
+static int queue_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -EINVAL;
+}
+
+const struct bpf_map_ops queue_map_ops = {
+ .map_alloc_check = queue_map_alloc_check,
+ .map_alloc = queue_map_alloc,
+ .map_free = queue_map_free,
+ .map_lookup_elem = queue_map_lookup_elem,
+ .map_update_elem = queue_map_update_elem,
+ .map_delete_elem = queue_map_delete_elem,
+ .map_get_next_key = queue_map_get_next_key,
+};
+
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a31a1ba0f8ea..7e9a11d69eef 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -622,11 +622,19 @@ static int map_lookup_elem(union bpf_attr *attr)
err = -EPERM;
goto err_put;
}
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ if (ukey) {
+ err = -EINVAL;
+ goto err_put;
+ }

- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
+ key = NULL;
}

if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -709,10 +717,19 @@ static int map_update_elem(union bpf_attr *attr)
goto err_put;
}

- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ if (ukey) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ key = NULL;
}

if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -803,10 +820,19 @@ static int map_delete_elem(union bpf_attr *attr)
goto err_put;
}

- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ if (ukey) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ key = NULL;
}

if (bpf_map_is_dev_bound(map)) {
@@ -855,9 +881,14 @@ static int map_get_next_key(union bpf_attr *attr)
}

if (ukey) {
- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ err = -EINVAL;
goto err_put;
}
} else {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 25e47c195874..b9e730090822 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1980,7 +1980,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
- type != expected_type)
+ type != expected_type && type != SCALAR_VALUE)
goto err_type;
Why do you have "type != SCALAR_VALUE" here?
I need this because the key for the bpf_map_[lookup, update]_element operations in the queue map must be NULL.
The below checks guarantee that NULL is only allowed for queue maps.


} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -2021,6 +2021,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
meta->map_ptr = reg->map_ptr;
} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ bool zero_size_allowed = false;
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
* stack limits and initialized
@@ -2034,8 +2035,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
+
+ if (meta->map_ptr->map_type == BPF_MAP_TYPE_QUEUE)
+ zero_size_allowed = true;
+
err = check_helper_mem_access(env, regno,
- meta->map_ptr->key_size, false,
+ meta->map_ptr->key_size,
+ zero_size_allowed,
NULL);
} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
/* bpf_map_xxx(..., map_ptr, ..., value) call: