Topics

[RFC PATCH v2 1/3] bpf: add bpf queue map

Mauricio Vasquez
 

Bpf queue implements LIFO/FIFO data containers for eBPF programs.

It allows pushing an element to the queue by using the update operation
and popping an element from the queue by using the lookup operation.

A use case for this is to keep track of a pool of elements, like
network ports in a SNAT.

Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@...>
---
include/linux/bpf_types.h | 1
include/uapi/linux/bpf.h | 5 +
kernel/bpf/Makefile | 2
kernel/bpf/queuemap.c | 285 +++++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/syscall.c | 61 +++++++---
kernel/bpf/verifier.c | 10 +-
6 files changed, 346 insertions(+), 18 deletions(-)
create mode 100644 kernel/bpf/queuemap.c

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index c5700c2d5549..6c7a62f3fe43 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -58,3 +58,4 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
#endif
#endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0ebaaf7f3568..2c171c40eb45 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -120,6 +120,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_CPUMAP,
BPF_MAP_TYPE_XSKMAP,
BPF_MAP_TYPE_SOCKHASH,
+ BPF_MAP_TYPE_QUEUE,
};

enum bpf_prog_type {
@@ -255,6 +256,10 @@ enum bpf_attach_type {
/* Flag for stack_map, store build_id+offset instead of pointer */
#define BPF_F_STACK_BUILD_ID (1U << 5)

+/* Flags for queue_map, type of queue */
+#define BPF_F_QUEUE_FIFO (1U << 16)
+#define BPF_F_QUEUE_LIFO (2U << 16)
+
enum bpf_stack_build_id_status {
/* user space need an empty entry to identify end of a trace */
BPF_STACK_BUILD_ID_EMPTY = 0,
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f27f5496d6fe..30f02ef66635 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
obj-y := core.o

obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o queuemap.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
ifeq ($(CONFIG_NET),y)
diff --git a/kernel/bpf/queuemap.c b/kernel/bpf/queuemap.c
new file mode 100644
index 000000000000..b4f870ac9f3a
--- /dev/null
+++ b/kernel/bpf/queuemap.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * queuemap.c: BPF queue map
+ *
+ * Copyright (c) 2018 Politecnico di Torino
+ */
+#include <linux/bpf.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include "percpu_freelist.h"
+
+#define QUEUE_CREATE_FLAG_MASK \
+ (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
+ BPF_F_QUEUE_FIFO | BPF_F_QUEUE_LIFO)
+
+enum queue_type {
+ QUEUE_FIFO = (BPF_F_QUEUE_FIFO >> 16),
+ QUEUE_LIFO = (BPF_F_QUEUE_LIFO >> 16),
+};
+
+struct bpf_queue {
+ struct bpf_map map;
+ struct list_head head;
+ struct pcpu_freelist freelist;
+ void *nodes;
+ enum queue_type type;
+ raw_spinlock_t lock;
+ atomic_t count;
+ u32 node_size;
+};
+
+struct queue_node {
+ struct pcpu_freelist_node fnode;
+ struct bpf_queue *queue;
+ struct list_head list;
+ struct rcu_head rcu;
+ char element[0] __aligned(8);
+};
+
+static bool queue_map_is_prealloc(struct bpf_queue *queue) {
+ return !(queue->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
+/* Called from syscall */
+static int queue_map_alloc_check(union bpf_attr *attr)
+{
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 0 ||
+ attr->value_size == 0 || attr->map_flags & ~QUEUE_CREATE_FLAG_MASK)
+ return -EINVAL;
+
+ if ((attr->map_flags >> 16) != QUEUE_FIFO &&
+ (attr->map_flags >> 16) != QUEUE_LIFO) {
+ return -EINVAL;
+ }
+
+ if (attr->value_size > KMALLOC_MAX_SIZE)
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements.
+ */
+ return -E2BIG;
+
+ return 0;
+}
+
+static int prealloc_init(struct bpf_queue *queue)
+{
+ u32 node_size = sizeof(struct queue_node) +
+ round_up(queue->map.value_size, 8);
+ u32 num_entries = queue->map.max_entries;
+ int err;
+
+ queue->nodes = bpf_map_area_alloc(node_size * num_entries,
+ queue->map.numa_node);
+ if (!queue->nodes)
+ return -ENOMEM;
+
+ err = pcpu_freelist_init(&queue->freelist);
+ if (err)
+ goto free_nodes;
+
+ pcpu_freelist_populate(&queue->freelist,
+ queue->nodes + offsetof(struct queue_node, fnode),
+ node_size, num_entries);
+
+ return 0;
+
+free_nodes:
+ bpf_map_area_free(queue->nodes);
+ return err;
+}
+
+static void prealloc_destroy(struct bpf_queue *queue)
+{
+ bpf_map_area_free(queue->nodes);
+ pcpu_freelist_destroy(&queue->freelist);
+}
+
+static struct bpf_map *queue_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_queue *queue;
+ u64 cost = sizeof(*queue);
+ int ret = 0; /* must be 0: prealloc_init() is skipped for BPF_F_NO_PREALLOC */
+
+ queue = kzalloc(sizeof(*queue), GFP_USER);
+ if (!queue)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&queue->map, attr);
+
+ queue->node_size = sizeof(struct queue_node) +
+ round_up(attr->value_size, 8);
+ cost += (u64) attr->max_entries * queue->node_size;
+ if (cost >= U32_MAX - PAGE_SIZE) {
+ ret = -E2BIG;
+ goto free_queue;
+ }
+
+ queue->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ ret = bpf_map_precharge_memlock(queue->map.pages);
+ if (ret)
+ goto free_queue;
+
+ INIT_LIST_HEAD(&queue->head);
+
+ raw_spin_lock_init(&queue->lock);
+
+ queue->type = attr->map_flags >> 16;
+
+ if (queue_map_is_prealloc(queue))
+ ret = prealloc_init(queue);
+ if (ret)
+ goto free_queue;
+
+ return &queue->map;
+
+free_queue:
+ kfree(queue);
+ return ERR_PTR(ret);
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void queue_map_free(struct bpf_map *map)
+{
+ struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
+ struct queue_node *l, *p; /* _safe iteration: nodes are freed while walking */
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete
+ */
+ synchronize_rcu();
+
+ /* some of queue_elem_free_rcu() callbacks for elements of this map may
+ * not have executed. Wait for them.
+ */
+ rcu_barrier();
+ if (!queue_map_is_prealloc(queue))
+ list_for_each_entry_safe(l, p, &queue->head, list) {
+ list_del(&l->list);
+ kfree(l);
+ }
+ else
+ prealloc_destroy(queue);
+ kfree(queue);
+}
+
+static void queue_elem_free_rcu(struct rcu_head *head)
+{
+ struct queue_node *l = container_of(head, struct queue_node, rcu);
+ struct bpf_queue *queue = l->queue;
+
+ /* must increment bpf_prog_active to avoid kprobe+bpf triggering while
+ * we're calling kfree, otherwise deadlock is possible if kprobes
+ * are placed somewhere inside of slub
+ */
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ if (queue_map_is_prealloc(queue))
+ pcpu_freelist_push(&queue->freelist, &l->fnode);
+ else
+ kfree(l);
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+}
+
+/* Called from syscall or from eBPF program */
+static void *queue_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
+ unsigned long flags;
+ struct queue_node *node;
+
+ raw_spin_lock_irqsave(&queue->lock, flags);
+
+ node = list_first_or_null_rcu(&queue->head, struct queue_node, list);
+ if (!node) {
+ raw_spin_unlock_irqrestore(&queue->lock, flags);
+ return NULL;
+ }
+
+ if (!queue_map_is_prealloc(queue))
+ atomic_dec(&queue->count);
+
+ list_del_rcu(&node->list);
+ call_rcu(&node->rcu, queue_elem_free_rcu);
+
+ raw_spin_unlock_irqrestore(&queue->lock, flags);
+
+ return &node->element;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
+ unsigned long flags;
+ struct queue_node *new;
+
+ if (!queue_map_is_prealloc(queue)) {
+ if (atomic_inc_return(&queue->count) > queue->map.max_entries) {
+ atomic_dec(&queue->count);
+ return -E2BIG;
+ }
+
+ new = kmalloc_node(queue->node_size, GFP_ATOMIC | __GFP_NOWARN,
+ queue->map.numa_node);
+ if (!new) {
+ atomic_dec(&queue->count);
+ return -ENOMEM;
+ }
+ } else {
+ struct pcpu_freelist_node *l;
+
+ l = pcpu_freelist_pop(&queue->freelist);
+ if (!l)
+ return -E2BIG;
+ new = container_of(l, struct queue_node, fnode);
+ }
+
+ memcpy(new->element, value, queue->map.value_size);
+ new->queue = queue;
+
+ raw_spin_lock_irqsave(&queue->lock, flags);
+ switch (queue->type) {
+ case QUEUE_FIFO:
+ list_add_tail_rcu(&new->list, &queue->head);
+ break;
+
+ case QUEUE_LIFO:
+ list_add_rcu(&new->list, &queue->head);
+ break;
+ }
+
+ raw_spin_unlock_irqrestore(&queue->lock, flags);
+
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+/* Called from syscall */
+static int queue_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -EINVAL;
+}
+
+const struct bpf_map_ops queue_map_ops = {
+ .map_alloc_check = queue_map_alloc_check,
+ .map_alloc = queue_map_alloc,
+ .map_free = queue_map_free,
+ .map_lookup_elem = queue_map_lookup_elem,
+ .map_update_elem = queue_map_update_elem,
+ .map_delete_elem = queue_map_delete_elem,
+ .map_get_next_key = queue_map_get_next_key,
+};
+
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a31a1ba0f8ea..7e9a11d69eef 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -622,11 +622,19 @@ static int map_lookup_elem(union bpf_attr *attr)
err = -EPERM;
goto err_put;
}
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ if (ukey) {
+ err = -EINVAL;
+ goto err_put;
+ }

- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
+ key = NULL;
}

if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -709,10 +717,19 @@ static int map_update_elem(union bpf_attr *attr)
goto err_put;
}

- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ if (ukey) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ key = NULL;
}

if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -803,10 +820,19 @@ static int map_delete_elem(union bpf_attr *attr)
goto err_put;
}

- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ if (ukey) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ key = NULL;
}

if (bpf_map_is_dev_bound(map)) {
@@ -855,9 +881,14 @@ static int map_get_next_key(union bpf_attr *attr)
}

if (ukey) {
- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ err = -EINVAL;
goto err_put;
}
} else {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 25e47c195874..b9e730090822 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1980,7 +1980,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
- type != expected_type)
+ type != expected_type && type != SCALAR_VALUE)
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -2021,6 +2021,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
meta->map_ptr = reg->map_ptr;
} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ bool zero_size_allowed = false;
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
* stack limits and initialized
@@ -2034,8 +2035,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
+
+ if (meta->map_ptr->map_type == BPF_MAP_TYPE_QUEUE)
+ zero_size_allowed = true;
+
err = check_helper_mem_access(env, regno,
- meta->map_ptr->key_size, false,
+ meta->map_ptr->key_size,
+ zero_size_allowed,
NULL);
} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
/* bpf_map_xxx(..., map_ptr, ..., value) call:

Yonghong Song
 

On Thu, Aug 2, 2018 at 10:30 AM, Mauricio Vasquez
<mauricio.vasquez@...> wrote:
Bpf queue implements a LIFO/FIFO data containers for ebpf programs.

It allows to push an element to the queue by using the update operation
and to pop an element from the queue by using the lookup operation.

A use case for this is to keep track of a pool of elements, like
network ports in a SNAT.

Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@...>
---
include/linux/bpf_types.h | 1
include/uapi/linux/bpf.h | 5 +
kernel/bpf/Makefile | 2
kernel/bpf/queuemap.c | 285 +++++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/syscall.c | 61 +++++++---
kernel/bpf/verifier.c | 10 +-
6 files changed, 346 insertions(+), 18 deletions(-)
create mode 100644 kernel/bpf/queuemap.c

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index c5700c2d5549..6c7a62f3fe43 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -58,3 +58,4 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
#endif
#endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0ebaaf7f3568..2c171c40eb45 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -120,6 +120,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_CPUMAP,
BPF_MAP_TYPE_XSKMAP,
BPF_MAP_TYPE_SOCKHASH,
+ BPF_MAP_TYPE_QUEUE,
};

enum bpf_prog_type {
@@ -255,6 +256,10 @@ enum bpf_attach_type {
/* Flag for stack_map, store build_id+offset instead of pointer */
#define BPF_F_STACK_BUILD_ID (1U << 5)

+/* Flags for queue_map, type of queue */
+#define BPF_F_QUEUE_FIFO (1U << 16)
+#define BPF_F_QUEUE_LIFO (2U << 16)
+
enum bpf_stack_build_id_status {
/* user space need an empty entry to identify end of a trace */
BPF_STACK_BUILD_ID_EMPTY = 0,
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f27f5496d6fe..30f02ef66635 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
obj-y := core.o

obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o queuemap.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
ifeq ($(CONFIG_NET),y)
diff --git a/kernel/bpf/queuemap.c b/kernel/bpf/queuemap.c
new file mode 100644
index 000000000000..b4f870ac9f3a
--- /dev/null
+++ b/kernel/bpf/queuemap.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * queuemap.c: BPF queue map
+ *
+ * Copyright (c) 2018 Politecnico di Torino
+ */
+#include <linux/bpf.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include "percpu_freelist.h"
+
+#define QUEUE_CREATE_FLAG_MASK \
+ (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
+ BPF_F_QUEUE_FIFO | BPF_F_QUEUE_LIFO)
+
+enum queue_type {
+ QUEUE_FIFO = (BPF_F_QUEUE_FIFO >> 16),
+ QUEUE_LIFO = (BPF_F_QUEUE_LIFO >> 16),
+};
+
+struct bpf_queue {
+ struct bpf_map map;
+ struct list_head head;
+ struct pcpu_freelist freelist;
+ void *nodes;
+ enum queue_type type;
+ raw_spinlock_t lock;
+ atomic_t count;
+ u32 node_size;
+};
+
+struct queue_node {
+ struct pcpu_freelist_node fnode;
+ struct bpf_queue *queue;
+ struct list_head list;
+ struct rcu_head rcu;
+ char element[0] __aligned(8);
+};
+
+static bool queue_map_is_prealloc(struct bpf_queue *queue) {
+ return !(queue->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
+/* Called from syscall */
+static int queue_map_alloc_check(union bpf_attr *attr)
+{
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 0 ||
+ attr->value_size == 0 || attr->map_flags & ~QUEUE_CREATE_FLAG_MASK)
+ return -EINVAL;
+
+ if ((attr->map_flags >> 16) != QUEUE_FIFO &&
+ (attr->map_flags >> 16) != QUEUE_LIFO) {
+ return -EINVAL;
+ }
+
+ if (attr->value_size > KMALLOC_MAX_SIZE)
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements.
+ */
+ return -E2BIG;
+
+ return 0;
+}
+
+static int prealloc_init(struct bpf_queue *queue)
+{
+ u32 node_size = sizeof(struct queue_node) +
+ round_up(queue->map.value_size, 8);
+ u32 num_entries = queue->map.max_entries;
+ int err;
+
+ queue->nodes = bpf_map_area_alloc(node_size * num_entries,
+ queue->map.numa_node);
+ if (!queue->nodes)
+ return -ENOMEM;
+
+ err = pcpu_freelist_init(&queue->freelist);
+ if (err)
+ goto free_nodes;
+
+ pcpu_freelist_populate(&queue->freelist,
+ queue->nodes + offsetof(struct queue_node, fnode),
+ node_size, num_entries);
+
+ return 0;
+
+free_nodes:
+ bpf_map_area_free(queue->nodes);
+ return err;
+}
+
+static void prealloc_destroy(struct bpf_queue *queue)
+{
+ bpf_map_area_free(queue->nodes);
+ pcpu_freelist_destroy(&queue->freelist);
+}
+
+static struct bpf_map *queue_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_queue *queue;
+ u64 cost = sizeof(*queue);
+ int ret;
+
+ queue = kzalloc(sizeof(*queue), GFP_USER);
+ if (!queue)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&queue->map, attr);
+
+ queue->node_size = sizeof(struct queue_node) +
+ round_up(attr->value_size, 8);
+ cost += (u64) attr->max_entries * queue->node_size;
+ if (cost >= U32_MAX - PAGE_SIZE) {
+ ret = -E2BIG;
+ goto free_queue;
+ }
+
+ queue->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ ret = bpf_map_precharge_memlock(queue->map.pages);
+ if (ret)
+ goto free_queue;
+
+ INIT_LIST_HEAD(&queue->head);
+
+ raw_spin_lock_init(&queue->lock);
+
+ queue->type = attr->map_flags >> 16;
+
+ if (queue_map_is_prealloc(queue))
+ ret = prealloc_init(queue);
+ if (ret)
+ goto free_queue;
+
+ return &queue->map;
+
+free_queue:
+ kfree(queue);
+ return ERR_PTR(ret);
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void queue_map_free(struct bpf_map *map)
+{
+ struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
+ struct queue_node *l;
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete
+ */
+ synchronize_rcu();
+
+ /* some of queue_elem_free_rcu() callbacks for elements of this map may
+ * not have executed. Wait for them.
+ */
+ rcu_barrier();
+ if (!queue_map_is_prealloc(queue))
+ list_for_each_entry_rcu(l, &queue->head, list) {
+ list_del_rcu(&l->list);
+ kfree(l);
+ }
+ else
+ prealloc_destroy(queue);
+ kfree(queue);
+}
+
+static void queue_elem_free_rcu(struct rcu_head *head)
+{
+ struct queue_node *l = container_of(head, struct queue_node, rcu);
+ struct bpf_queue *queue = l->queue;
+
+ /* must increment bpf_prog_active to avoid kprobe+bpf triggering while
+ * we're calling kfree, otherwise deadlock is possible if kprobes
+ * are placed somewhere inside of slub
+ */
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ if (queue_map_is_prealloc(queue))
+ pcpu_freelist_push(&queue->freelist, &l->fnode);
+ else
+ kfree(l);
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+}
+
+/* Called from syscall or from eBPF program */
+static void *queue_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
+ unsigned long flags;
+ struct queue_node *node;
+
+ raw_spin_lock_irqsave(&queue->lock, flags);
+
+ node = list_first_or_null_rcu(&queue->head, struct queue_node, list);
+ if (!node) {
+ raw_spin_unlock_irqrestore(&queue->lock, flags);
+ return NULL;
+ }
+
+ if (!queue_map_is_prealloc(queue))
+ atomic_dec(&queue->count);
+
+ list_del_rcu(&node->list);
+ call_rcu(&node->rcu, queue_elem_free_rcu);
+
+ raw_spin_unlock_irqrestore(&queue->lock, flags);
+
+ return &node->element;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_queue *queue = container_of(map, struct bpf_queue, map);
+ unsigned long flags;
+ struct queue_node *new;
+
+ if (!queue_map_is_prealloc(queue)) {
+ if (atomic_inc_return(&queue->count) > queue->map.max_entries) {
+ atomic_dec(&queue->count);
+ return -E2BIG;
+ }
+
+ new = kmalloc_node(queue->node_size, GFP_ATOMIC | __GFP_NOWARN,
+ queue->map.numa_node);
+ if (!new) {
+ atomic_dec(&queue->count);
+ return -ENOMEM;
+ }
+ } else {
+ struct pcpu_freelist_node *l;
+
+ l = pcpu_freelist_pop(&queue->freelist);
+ if (!l)
+ return -E2BIG;
+ new = container_of(l, struct queue_node, fnode);
+ }
+
+ memcpy(new->element, value, queue->map.value_size);
+ new->queue = queue;
+
+ raw_spin_lock_irqsave(&queue->lock, flags);
+ switch (queue->type) {
+ case QUEUE_FIFO:
+ list_add_tail_rcu(&new->list, &queue->head);
+ break;
+
+ case QUEUE_LIFO:
+ list_add_rcu(&new->list, &queue->head);
+ break;
+ }
+
+ raw_spin_unlock_irqrestore(&queue->lock, flags);
+
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+/* Called from syscall */
+static int queue_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -EINVAL;
+}
+
+const struct bpf_map_ops queue_map_ops = {
+ .map_alloc_check = queue_map_alloc_check,
+ .map_alloc = queue_map_alloc,
+ .map_free = queue_map_free,
+ .map_lookup_elem = queue_map_lookup_elem,
+ .map_update_elem = queue_map_update_elem,
+ .map_delete_elem = queue_map_delete_elem,
+ .map_get_next_key = queue_map_get_next_key,
+};
+
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a31a1ba0f8ea..7e9a11d69eef 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -622,11 +622,19 @@ static int map_lookup_elem(union bpf_attr *attr)
err = -EPERM;
goto err_put;
}
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ if (ukey) {
+ err = -EINVAL;
+ goto err_put;
+ }

- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
+ key = NULL;
}

if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -709,10 +717,19 @@ static int map_update_elem(union bpf_attr *attr)
goto err_put;
}

- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ if (ukey) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ key = NULL;
}

if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -803,10 +820,19 @@ static int map_delete_elem(union bpf_attr *attr)
goto err_put;
}

- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ if (ukey) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ key = NULL;
}

if (bpf_map_is_dev_bound(map)) {
@@ -855,9 +881,14 @@ static int map_get_next_key(union bpf_attr *attr)
}

if (ukey) {
- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
+ if (map->map_type != BPF_MAP_TYPE_QUEUE) {
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
+ } else {
+ err = -EINVAL;
goto err_put;
}
} else {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 25e47c195874..b9e730090822 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1980,7 +1980,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
- type != expected_type)
+ type != expected_type && type != SCALAR_VALUE)
Maybe we should "&& (arg_type == ARG_PTR_TO_MAP_VALUE || type != SCALAR_VALUE)"?

goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -2021,6 +2021,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
meta->map_ptr = reg->map_ptr;
} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ bool zero_size_allowed = false;
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
* stack limits and initialized
@@ -2034,8 +2035,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
+
+ if (meta->map_ptr->map_type == BPF_MAP_TYPE_QUEUE)
+ zero_size_allowed = true;
+
err = check_helper_mem_access(env, regno,
- meta->map_ptr->key_size, false,
+ meta->map_ptr->key_size,
+ zero_size_allowed,
NULL);
} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
/* bpf_map_xxx(..., map_ptr, ..., value) call: