[RFC PATCHv2 04/13] lib/bpf: add support for managing bpf program/map.


William Tu
 

From: Joe Stringer <joe@...>

Through libbpf, this patch adds support for loading BPF programs
and maps, pinning them under /sys/fs/bpf/ovs/, managing
the file descriptor of each loaded map, and printing them.

Signed-off-by: Joe Stringer <joe@...>
Co-authored-by: William Tu <u9012063@...>
Co-authored-by: Yifeng Sun <pkusunyifeng@...>
---
lib/bpf.c | 524 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
lib/bpf.h | 69 +++++++++
2 files changed, 593 insertions(+)
create mode 100644 lib/bpf.c
create mode 100644 lib/bpf.h

diff --git a/lib/bpf.c b/lib/bpf.c
new file mode 100644
index 000000000000..48c677e54659
--- /dev/null
+++ b/lib/bpf.c
@@ -0,0 +1,524 @@
+/*
+ * Copyright (c) 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include <linux/bpf.h>
+#include <linux/limits.h>
+#include <linux/magic.h>
+#include <iproute2/bpf_elf.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/vfs.h>
+#include <sys/resource.h>
+
+#include "bpf.h"
+#include "bpf/odp-bpf.h"
+#include "util.h"
+#include "openvswitch/dynamic-string.h"
+#include "openvswitch/vlog.h"
+
+/* Directory under which bpf_load() pins the datapath's BPF objects and from
+ * which bpf_get() opens them. */
+#define BPF_FS_PATH "/sys/fs/bpf/ovs/"
+static const char *ovs_bpf_path = BPF_FS_PATH;
+
+/* NOTE(review): not referenced anywhere in this file; bpf_get() iterates up
+ * to BPF_MAX_PROG_ARRAY from bpf.h instead.  Confirm whether this can go. */
+#define MAX_BPF_PROG_ARRAY 64 //FIXME
+VLOG_DEFINE_THIS_MODULE(bpf);
+
+/* Appends a two-line description of 'prog' to 'ds': its name, then its tc
+ * handle printed as eight hexadecimal digits. */
+static void
+bpf_format_prog(struct ds *ds, const struct bpf_prog *prog)
+{
+    const char *name = prog->name;
+    uint32_t handle = prog->handle;
+
+    ds_put_format(ds, " %s:\n", name);
+    ds_put_format(ds, " handle: %08"PRIx32"\n", handle);
+}
+
+/* Callback used by the MAP_FORMAT_FUNC-generated dumpers to print one map
+ * element.  It receives the output string, the element's key converted to
+ * uint64_t, and a pointer to the element's value. */
+typedef void map_element_writer_t(struct ds *, uint64_t, void *);
+
+/* map_element_writer_t for the datapath statistics map.
+ *
+ * Appends the name of the counter whose index is 'key' to 'ds', or
+ * "unknown-<key>" if the index has no name.  OVS_DP_STATS_UNSPEC is special:
+ * nothing is appended for it and any trailing spaces already in 'ds' are
+ * stripped instead.  Unless 'key' is zero, ": <value>\n" follows, where the
+ * value is the uint64_t that 'value_' points to. */
+static void
+format_dp_stats(struct ds *ds, uint64_t key, void *value_)
+{
+    static const char *const names[] = {
+        [OVS_DP_STATS_HIT] = "hit",
+        [OVS_DP_STATS_MISSED] = "missed",
+        [OVS_DP_STATS_LOST] = "lost",
+        [OVS_DP_STATS_FLOWS] = "flows",
+        [OVS_DP_STATS_MASK_HIT] = "masks_hit",
+        [OVS_DP_STATS_MASKS] = "masks",
+        [OVS_DP_STATS_ERRORS] = "errors",
+    };
+    const uint64_t value = *(uint64_t *) value_;
+
+    if (key == OVS_DP_STATS_UNSPEC) {
+        /* Remove whatever space padding precedes this element. */
+        while (ds_chomp(ds, ' ')) {
+            continue;
+        }
+    } else if (key < ARRAY_SIZE(names) && names[key]) {
+        ds_put_cstr(ds, names[key]);
+    } else {
+        ds_put_format(ds, "unknown-%"PRIu64, key);
+    }
+
+    if (key) {
+        ds_put_format(ds, ": %"PRIu64"\n", value);
+    }
+}
+
+/* map_element_writer_t for the upcalls map: appends "cpu-<key>\n" to 'ds'.
+ * The element's value is ignored. */
+static void
+format_upcalls(struct ds *ds, uint64_t key, void *value OVS_UNUSED)
+{
+    const uint64_t cpu = key;
+
+    ds_put_format(ds, "cpu-%"PRIu64"\n", cpu);
+}
+
+/* map_element_writer_t for the tailcalls map: appends the slot index 'key'
+ * and the uint32_t that 'value_' points to (labelled "prog_fd") to 'ds'. */
+static void
+format_tailcalls(struct ds *ds, uint64_t key, void *value_)
+{
+    const uint32_t *value = value_;
+
+    /* The value is unsigned, so it must not be printed with "%d". */
+    ds_put_format(ds, "index-%"PRIu64"prog_fd-%"PRIu32"\n", key, *value);
+}
+
+/* Looks up 'key' in the BPF map open on 'fd' and stores the element in
+ * '*value'.  'key_len' is the size of the key in bytes; it is only used to
+ * hex-dump the key in the debug message that is logged on failure.
+ *
+ * Returns 0 on success, otherwise the nonzero result of
+ * bpf_map_lookup_elem(). */
+static int
+lookup_elem(int fd, void *key, size_t key_len, void *value)
+{
+    int err = bpf_map_lookup_elem(fd, key, value);
+
+    if (err && VLOG_IS_DBG_ENABLED()) {
+        /* Capture errno before building the message: the string helpers
+         * below may allocate memory and overwrite it. */
+        int saved_errno = errno;
+        struct ds ds = DS_EMPTY_INITIALIZER;
+
+        ds_put_cstr(&ds, "error occurred looking up elem ");
+        ds_put_hex(&ds, key, key_len);
+        ds_put_format(&ds, ": %s", ovs_strerror(saved_errno));
+        VLOG_DBG("%s", ds_cstr(&ds));
+        ds_destroy(&ds);
+    }
+
+    return err;
+}
+
+/* Defines a function NAME(ds, map, fmt) that appends a dump of BPF map 'map'
+ * to 'ds'.  KTYPE and VTYPE are the C types of the buffers used to hold one
+ * key and one value.  The generated function:
+ *
+ *   - prints the map's name;
+ *   - looks up key 0 and, if that succeeds, counts it and passes it to 'fmt';
+ *   - then repeatedly calls bpf_map_get_next_key(), counting every key it
+ *     returns and passing each element that can be looked up to 'fmt';
+ *   - finally prints the element count if PRINT_COUNT is true.
+ *
+ * 'fmt' may be NULL, in which case elements are counted but not printed.
+ *
+ * NOTE(review): iteration is seeded with key 0 rather than with the map's
+ * first key.  For hash maps this presumably can skip or repeat elements;
+ * confirm against the bpf_map_get_next_key() semantics. */
+#define MAP_FORMAT_FUNC(NAME, KTYPE, VTYPE, PRINT_COUNT) \
+ static void NAME(struct ds *ds, const struct bpf_map *map, \
+ map_element_writer_t fmt) \
+ { \
+ KTYPE key = 0; \
+ VTYPE value; \
+ int count = 0; \
+ \
+ VLOG_DBG("reading map %s", map->name); \
+ ds_put_format(ds, " %s:\n", map->name); \
+ if (!lookup_elem(map->fd, &key, sizeof key, &value)) { \
+ count++; \
+ if (fmt) { \
+ ds_put_cstr(ds, " "); \
+ fmt(ds, key, &value); \
+ } \
+ } \
+ while (!bpf_map_get_next_key(map->fd, &key, &key)) { \
+ count++; \
+ if (fmt) { \
+ if (!lookup_elem(map->fd, &key, sizeof key, &value)) { \
+ ds_put_cstr(ds, " "); \
+ fmt(ds, key, &value); \
+ } \
+ } \
+ }; \
+ if (PRINT_COUNT) { \
+ ds_put_format(ds, " count: %d\n", count); \
+ } \
+ }
+
+/* One dumper per map; only the statistics dumper omits the element count. */
+MAP_FORMAT_FUNC(bpf_format_map_stats, uint64_t, uint64_t, false);
+MAP_FORMAT_FUNC(bpf_format_map_flows, uint64_t, struct bpf_flow, true);
+MAP_FORMAT_FUNC(bpf_format_map_upcalls, uint32_t, uint32_t, true);
+MAP_FORMAT_FUNC(bpf_format_map_tailcalls, uint32_t, uint32_t, true);//FIXME
+/* TODO: add a dumper for the dp_flow_stats map. */
+
+/* Appends a human-readable dump of 'state' to 'ds': the pin path, then the
+ * datapath_stats, flow_table, upcalls and tailcalls maps, and finally the
+ * downcall, egress, ingress and xdp programs, in that order. */
+void
+bpf_format_state(struct ds *ds, struct bpf_state *state)
+{
+    const struct bpf_prog *progs[] = {
+        &state->downcall,
+        &state->egress,
+        &state->ingress,
+        &state->xdp,
+    };
+    size_t i;
+
+    ds_put_format(ds, "path: %s\n", ovs_bpf_path);
+
+    ds_put_cstr(ds, "maps:\n");
+    bpf_format_map_stats(ds, &state->datapath_stats, format_dp_stats);
+    bpf_format_map_flows(ds, &state->flow_table, NULL);
+    bpf_format_map_upcalls(ds, &state->upcalls, format_upcalls);
+    bpf_format_map_tailcalls(ds, &state->tailcalls, format_tailcalls);
+
+    ds_put_cstr(ds, "programs:\n");
+    for (i = 0; i < ARRAY_SIZE(progs); i++) {
+        bpf_format_prog(ds, progs[i]);
+    }
+}
+
+/* Populates 'state' with the standard set of programs and maps for the
+ * openvswitch datapath, as sourced from the objects pinned under ovs_bpf_path.
+ * Every "tail-<k>/0" program pinned there is also opened and installed at
+ * index <k> of the tailcalls program array; missing ones are skipped.
+ *
+ * If 'verbose' is true, a failure is logged at warning level, otherwise at
+ * debug level.
+ *
+ * Returns 0 on success, or positive errno on error.  On error, every file
+ * descriptor opened by this call has been closed again.  If successful, the
+ * caller is responsible for releasing the resources in 'state' via bpf_put().
+ */
+int
+bpf_get(struct bpf_state *state, bool verbose)
+{
+    const struct {
+        int *fd;
+        const char *path;
+    } objs[] = {
+        /* BPF Programs */
+        {&state->ingress.fd, "ingress/0"},
+        {&state->egress.fd, "egress/0"},
+        {&state->downcall.fd, "downcall/0"},
+        {&state->xdp.fd, "xdp/0"},
+        /* BPF Maps */
+        {&state->upcalls.fd, "upcalls"},
+        {&state->flow_table.fd, "flow_table"},
+        {&state->datapath_stats.fd, "datapath_stats"},
+        {&state->tailcalls.fd, "tailcalls"},
+        {&state->execute_actions.fd, "execute_actions"},
+        {&state->dp_flow_stats.fd, "dp_flow_stats"},
+    };
+    char buf[BUFSIZ];
+    int error = 0;
+    int i, k;
+
+    /* 'state' may arrive uninitialized.  Mark every tail-call slot as empty
+     * (fd 0) up front so that the error path below never closes a descriptor
+     * that this call did not open. */
+    for (k = 0; k < BPF_MAX_PROG_ARRAY; k++) {
+        state->tailarray[k].fd = 0;
+    }
+
+    for (i = 0; i < ARRAY_SIZE(objs); i++) {
+        struct stat s;
+        int fd;
+
+        snprintf(buf, sizeof buf, "%s/%s", ovs_bpf_path, objs[i].path);
+        if (stat(buf, &s)) {
+            error = errno;
+            break;
+        }
+        fd = bpf_obj_get(buf);
+        if (fd < 0) {
+            error = errno;
+            break;
+        }
+        VLOG_DBG("Loaded BPF object at %s fd %d", buf, fd);
+        *objs[i].fd = fd;
+    }
+
+    /* Only touch the program array once all mandatory objects, including the
+     * tailcalls map itself, have been opened.  Running this after a failure
+     * would use an unset map fd and could overwrite 'error' with 0. */
+    if (!error) {
+        VLOG_DBG("start loading/pinning program array\n");
+    }
+    for (k = 0; !error && k < BPF_MAX_PROG_ARRAY; k++) {
+        struct stat s;
+        int prog_fd;
+
+        snprintf(buf, sizeof buf, "%s/tail-%d/0", ovs_bpf_path, k);
+        if (stat(buf, &s)) {
+            /* No program pinned for this slot: leave it empty. */
+            continue;
+        }
+
+        prog_fd = bpf_obj_get(buf);
+        if (prog_fd < 0) {
+            error = errno;
+            break;
+        }
+        VLOG_DBG("Loaded BPF object at %s", buf);
+        state->tailarray[k].fd = prog_fd;
+
+        if (bpf_map_update_elem(state->tailcalls.fd, &k, &prog_fd, BPF_ANY)) {
+            /* Report the errno, not the raw return value, so that callers
+             * get the positive errno promised above. */
+            error = errno;
+            VLOG_ERR("Can not add %s into BPF_MAP_PROG_ARRAY\n", buf);
+        }
+    }
+
+    if (error) {
+        VLOG(verbose ? VLL_WARN : VLL_DBG, "Failed to load %s: %s",
+             buf, ovs_strerror(error));
+
+        /* objs[0..i-1] were opened successfully; objs[i..] were not. */
+        for (int j = 0; j < i; j++) {
+            close(*objs[j].fd);
+            *objs[j].fd = 0;
+        }
+        for (int j = 0; j < BPF_MAX_PROG_ARRAY; j++) {
+            if (state->tailarray[j].fd) {
+                close(state->tailarray[j].fd);
+                state->tailarray[j].fd = 0;
+            }
+        }
+        return error;
+    }
+
+    state->ingress.handle = INGRESS_HANDLE;
+    state->ingress.name = xstrdup("ovs_cls_ingress");
+    state->egress.handle = EGRESS_HANDLE;
+    state->egress.name = xstrdup("ovs_cls_egress");
+    state->downcall.handle = INGRESS_HANDLE;
+    state->downcall.name = xstrdup("ovs_cls_downcall");
+    state->upcalls.name = xstrdup("upcalls");
+    state->xdp.name = xstrdup("xdp");
+    state->flow_table.name = xstrdup("flow_table");
+    state->datapath_stats.name = xstrdup("datapath_stats");
+    state->dp_flow_stats.name = xstrdup("dp_flow_stats");
+    state->tailcalls.name = xstrdup("tailcalls");
+
+    return 0;
+}
+
+/* Closes 'fd'.  If close() fails, logs a warning that identifies the
+ * descriptor by 'name'. */
+static void
+xclose(int fd, const char *name)
+{
+    if (!close(fd)) {
+        return;
+    }
+    VLOG_WARN("Failed to close BPF fd %s: %s", name, ovs_strerror(errno));
+}
+
+/* Frees resources allocated by bpf_get(): closes every file descriptor that
+ * bpf_get() opened in 'state' and frees the name strings that it set.
+ * 'state' must have been filled in by a successful bpf_get(). */
+void
+bpf_put(struct bpf_state *state)
+{
+    int i;
+
+    xclose(state->ingress.fd, state->ingress.name);
+    xclose(state->egress.fd, state->egress.name);
+    xclose(state->downcall.fd, state->downcall.name);
+    xclose(state->upcalls.fd, state->upcalls.name);
+    xclose(state->xdp.fd, state->xdp.name);
+    xclose(state->flow_table.fd, "ovs_map_flow_table");
+    xclose(state->datapath_stats.fd, "ovs_datapath_stats");
+    xclose(state->dp_flow_stats.fd, state->dp_flow_stats.name);
+
+    /* bpf_get() opens these as well; without closing them here their
+     * descriptors would leak.  It gives execute_actions and the tailarray
+     * entries no name, hence the literals. */
+    xclose(state->tailcalls.fd, state->tailcalls.name);
+    xclose(state->execute_actions.fd, "execute_actions");
+    for (i = 0; i < BPF_MAX_PROG_ARRAY; i++) {
+        /* bpf_get() stores fd 0 in slots for which nothing was pinned. */
+        if (state->tailarray[i].fd) {
+            xclose(state->tailarray[i].fd, "tailarray");
+        }
+    }
+
+    free((void *)state->ingress.name);
+    free((void *)state->egress.name);
+    free((void *)state->downcall.name);
+    free((void *)state->upcalls.name);
+    free((void *)state->xdp.name);
+    free((void *)state->flow_table.name);
+    free((void *)state->datapath_stats.name);
+    free((void *)state->dp_flow_stats.name);
+    free((void *)state->tailcalls.name);
+}
+
+/* Prepares the freshly opened BPF object 'obj' for loading: every program
+ * whose section title contains "xdp" is marked as an XDP program and every
+ * other program as a sched_cls program.  A failure to set the type is logged
+ * but otherwise ignored.  The object's programs and, if debug logging is
+ * enabled, its maps are listed in the debug log. */
+static void
+process(struct bpf_object *obj)
+{
+    struct bpf_program *prog;
+    struct bpf_map *map;
+
+    VLOG_DBG("Opened object '%s'\n", bpf_object__name(obj));
+    VLOG_DBG("Programs:\n");
+    bpf_object__for_each_program(prog, obj) {
+        const char *title = bpf_program__title(prog, false);
+        int error;
+
+        VLOG_DBG(" - %s\n", title);
+        error = strstr(title, "xdp")
+                ? bpf_program__set_xdp(prog)
+                : bpf_program__set_sched_cls(prog); // or sched_act?
+        if (error) {
+            /* libbpf returns negative errno values, whereas ovs_strerror()
+             * expects a positive one. */
+            VLOG_WARN("Failed to set '%s' prog type: %s\n", title,
+                      ovs_strerror(error < 0 ? -error : error));
+        }
+    }
+
+    if (VLOG_IS_DBG_ENABLED()) {
+        VLOG_DBG("Maps:\n");
+        bpf_map__for_each(map, obj) {
+            const char *name = bpf_map__name(map);
+            VLOG_DBG(" - %s\n", name);
+        }
+    }
+}
+
+/* Attempts to load the BPF datapath in the form of an ELF compiled for the BPF
+ * ISA in 'path', install it into the kernel, and pin it to the filesystem
+ * under ovs_bpf_path.  If a complete datapath is already pinned there (that
+ * is, bpf_get() succeeds), it is reused and 'path' is not opened at all.
+ *
+ * The RLIMIT_MEMLOCK limit of the process is raised to infinity first.
+ *
+ * Returns 0 on success, or positive errno on error.  libbpf-specific error
+ * codes are logged with libbpf's own description and reported as EINVAL.
+ */
+int
+bpf_load(const char *path)
+{
+    struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+    const char *stage = NULL;
+    struct bpf_state state;
+    struct bpf_object *obj;
+    long error;
+
+    if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+        /* setrlimit() returns -1 and reports the reason through errno. */
+        int saved_errno = errno;
+
+        VLOG_ERR("Failed to set rlimit %s", ovs_strerror(saved_errno));
+        return saved_errno;
+    }
+
+    if (!bpf_get(&state, false)) {
+        /* XXX: Restart; Upgrade */
+        VLOG_INFO("Re-using preloaded BPF datapath");
+        bpf_put(&state);
+        return 0;
+    }
+
+    obj = bpf_object__open(path);
+    error = libbpf_get_error(obj);
+    if (error) {
+        /* 'obj' is an error pointer here, so it must not be closed. */
+        stage = "open";
+        goto out;
+    }
+    process(obj);
+    error = bpf_object__load(obj);
+    if (error) {
+        stage = "load";
+        goto close_obj;
+    }
+    error = bpf_object__pin(obj, ovs_bpf_path);
+    if (error) {
+        stage = "pin";
+        goto close_obj;
+    }
+
+    error = bpf_object__unload(obj);
+    if (error) {
+        stage = "unload";
+    }
+
+close_obj:
+    bpf_object__close(obj);
+out:
+    if (!error) {
+        VLOG_DBG("Loaded BPF datapath from %s", path);
+        return 0;
+    }
+
+    /* libbpf reports errors as negative values. */
+    if (error < 0) {
+        error = -error;
+    }
+    /* __LIBBPF_ERRNO__START is itself a valid code (LIBBPF_ERRNO__LIBELF),
+     * so the lower bound is inclusive. */
+    if (error >= __LIBBPF_ERRNO__START && error < __LIBBPF_ERRNO__END) {
+        char buf[BUFSIZ];
+
+        libbpf_strerror(error, buf, sizeof buf);
+        VLOG_WARN("Failed to %s BPF datapath: %s\n", stage ? stage : "", buf);
+        error = EINVAL;
+    } else {
+        /* Plain errno values used to be returned without any log message. */
+        VLOG_WARN("Failed to %s BPF datapath: %s", stage ? stage : "",
+                  ovs_strerror(error));
+    }
+    return error;
+}
+
+/* Defines print_NAME(), a printf-style function that forwards its format
+ * string and arguments to this module's log at level VLL_NAME and returns 0.
+ * The three instances below are handed to libbpf_set_print() in bpf_init(). */
+#define PRINT_FN(NAME) \
+static int \
+print_##NAME(const char *fmt, ...) \
+{ \
+ va_list args; \
+ \
+ va_start(args, fmt); \
+ vlog_valist(&this_module, VLL_##NAME, fmt, args); \
+ va_end(args); \
+ return 0; \
+}
+
+PRINT_FN(WARN);
+PRINT_FN(INFO);
+PRINT_FN(DBG);
+
+/* Two-step stringification, so that a macro argument such as PATH_MAX is
+ * expanded to its value before being turned into a string literal. */
+#define stringize_(x) #x
+#define stringize(x) stringize_(x)
+
+/* Makes sure that a BPF filesystem is available.
+ *
+ * The mount point is taken from the first "bpf" entry in /proc/mounts, or
+ * BPF_FS_PATH if there is none or /proc/mounts cannot be read.  If a BPF
+ * filesystem is already mounted there, nothing else is done.  Otherwise the
+ * directory is created, a BPF filesystem is mounted on it, an "ovs"
+ * subdirectory is created, and ovs_bpf_path is pointed at that subdirectory.
+ *
+ * Returns 0 on success, or positive errno on error. */
+static int OVS_UNUSED
+mount_bpf(void)
+{
+    /* Heap copy that ovs_bpf_path points to once this function has replaced
+     * the initial string literal; only this copy may be passed to free(). */
+    static char *owned_path = NULL;
+    char path[PATH_MAX + 1];
+    char type[NAME_MAX + 1];
+    struct statfs st_fs;
+    bool found = false;
+    size_t len;
+    FILE *fp;
+    int err;
+
+    fp = fopen("/proc/mounts", "r");
+    if (fp) {
+        /* Each line is "<device> <mountpoint> <fstype> <rest>".  The final
+         * "%*[^\n]" discards <rest> so that the next call starts at the
+         * beginning of the following line. */
+        const char *fmt = "%*s %"stringize(PATH_MAX)"s "
+                          "%"stringize(NAME_MAX)"s%*[^\n]";
+
+        while (fscanf(fp, fmt, path, type) == 2) {
+            if (!strcmp(type, "bpf")) {
+                found = true;
+                break;
+            }
+        }
+        if (fclose(fp)) {
+            VLOG_INFO("Failed to close /proc/mounts: %s",
+                      ovs_strerror(errno));
+        }
+        if (!found) {
+            VLOG_DBG("Couldn't find bpf mountpoint in /proc/mounts");
+        }
+    } else {
+        VLOG_INFO("Cannot open /proc/mounts: %s", ovs_strerror(errno));
+    }
+    if (!found) {
+        /* Never fall through with the path of some unrelated mount. */
+        VLOG_DBG("Using %s for BPF filesystem mountpoint", BPF_FS_PATH);
+        snprintf(path, sizeof path, "%s", BPF_FS_PATH);
+    }
+
+    if (!statfs(path, &st_fs) && st_fs.f_type == BPF_FS_MAGIC) {
+        VLOG_INFO("BPF filesystem already mounted to %s", path);
+        return 0;
+    }
+
+    /* In the error paths below errno is saved before logging, because the
+     * logging call may change it. */
+    if (mkdir(path, 0755) && errno != EEXIST) {
+        err = errno;
+        VLOG_WARN("Failed to create %s: %s", path, ovs_strerror(err));
+        return err;
+    }
+
+    if (mount("bpf", path, "bpf", 0, NULL)) {
+        err = errno;
+        VLOG_WARN("Failed to mount BPF filesystem: %s", ovs_strerror(err));
+        return err;
+    }
+
+    len = strlen(path);
+    if (len + strlen("/ovs") >= PATH_MAX) {
+        VLOG_WARN("BPF filesystem path \"%s\" is too long.", path);
+        return ENAMETOOLONG;
+    }
+    /* Copies the suffix together with its terminating NUL. */
+    memcpy(&path[len], "/ovs", sizeof "/ovs");
+
+    if (mkdir(path, 0755) && errno != EEXIST) {
+        err = errno;
+        VLOG_WARN("Failed to create %s: %s", path, ovs_strerror(err));
+        return err;
+    }
+
+    free(owned_path);
+    owned_path = xstrdup(path);
+    ovs_bpf_path = owned_path;
+    return 0;
+}
+
+/* Routes libbpf's warning, info and debug output through this module's log
+ * by registering print_WARN(), print_INFO() and print_DBG() with libbpf.
+ * mount_bpf() is deliberately not called.  Always returns 0. */
+int
+bpf_init(void)
+{
+    const int result = 0;
+
+    libbpf_set_print(print_WARN, print_INFO, print_DBG);
+    return result;
+}
diff --git a/lib/bpf.h b/lib/bpf.h
new file mode 100644
index 000000000000..4b5afaf4f77f
--- /dev/null
+++ b/lib/bpf.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIB_BPF_H
+#define LIB_BPF_H 1
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include "openvswitch/compiler.h"
+
+#define INGRESS_HANDLE 0xFFFFFFF2
+#define EGRESS_HANDLE 0xFFFFFFF3
+
+/* A BPF program as tracked by the datapath. */
+struct bpf_prog {
+ const char *name; /* Name used when printing or logging the program. */
+ uint32_t handle; /* tc handle */
+ int fd; /* File descriptor referring to the program. */
+};
+
+/* A BPF map as tracked by the datapath.
+ *
+ * NOTE(review): lib/bpf.c also uses 'struct bpf_map *' with libbpf's
+ * bpf_map__for_each() and bpf_map__name(); confirm that sharing the tag with
+ * libbpf's own type is intended. */
+struct bpf_map {
+ const char *name; /* Name used when printing or logging the map. */
+ int fd; /* File descriptor referring to the map. */
+};
+
+#if HAVE_BPF
+struct bpf_state;
+struct ds;
+
+/* Number of entries in bpf_state.tailarray. */
+#define BPF_MAX_PROG_ARRAY 64
+
+/* The programs and maps that make up the BPF datapath.  Filled in by
+ * bpf_get() and released by bpf_put(). */
+struct bpf_state {
+ /* File descriptors for programs. */
+ struct bpf_prog ingress; /* BPF_PROG_TYPE_SCHED_CLS */
+ struct bpf_prog egress; /* BPF_PROG_TYPE_SCHED_CLS */
+ struct bpf_prog downcall; /* BPF_PROG_TYPE_SCHED_CLS */
+ /* Tail-call programs; bpf_get() leaves 'fd' at 0 in unused slots. */
+ struct bpf_prog tailarray[BPF_MAX_PROG_ARRAY];
+ struct bpf_prog xdp; /* BPF_PROG_TYPE_XDP */
+ /* TODO (william): struct bpf_prog parser, deparser, action. */
+
+ /* File descriptors for maps. */
+ struct bpf_map upcalls; /* BPF_MAP_TYPE_PERF_EVENT_ARRAY */
+ struct bpf_map flow_table; /* BPF_MAP_TYPE_HASH */
+ struct bpf_map datapath_stats; /* BPF_MAP_TYPE_ARRAY */
+ struct bpf_map tailcalls; /* BPF_MAP_TYPE_PROG_ARRAY */
+ struct bpf_map execute_actions; /* BPF_MAP_TYPE_ARRAY */
+ struct bpf_map dp_flow_stats; /* BPF_MAP_TYPE_HASH */
+};
+
+/* Opens the pinned datapath objects into 'state'.  Returns 0 on success or a
+ * positive errno value; 'verbose' selects the log level used on failure. */
+int bpf_get(struct bpf_state *state, bool verbose);
+/* Releases what bpf_get() stored in 'state'. */
+void bpf_put(struct bpf_state *state);
+/* Loads and pins the datapath from the ELF file 'path' unless one is already
+ * pinned.  Returns 0 on success or a positive errno value. */
+int bpf_load(const char *path);
+/* Registers this module's log functions with libbpf.  Returns 0. */
+int bpf_init(void);
+/* Appends a human-readable dump of 'state' to 'ds'. */
+void bpf_format_state(struct ds *ds, struct bpf_state *state);
+#else /* !HAVE_BPF */
+/* Without BPF support, loading fails with EOPNOTSUPP and initialization is a
+ * no-op that returns 0. */
+static inline int bpf_load(const char *path OVS_UNUSED) { return EOPNOTSUPP; }
+static inline int bpf_init(void) { return 0; }
+#endif /* HAVE_BPF */
+
+#endif /* LIB_BPF_H */
--
2.7.4