Topics

[RFC PATCH 05/11] dpif: add 'dpif-bpf' provider.

William Tu
 

From: Joe Stringer <joe@...>

Implement a new datapath interface for use with BPF datapaths.

Like dpif-netlink, dpif-bpf is backed by an implementation which resides
within the kernel. It uses the BPF functionality available in recent
versions of Linux to create the datapath. Unlike dpif-netlink there is no
datapath notion of a bridge with ports attached; dpif-bpf is implemented
by attaching BPF programs directly to individual devices using TC.

Upcalls are implemented using a perf event ringbuffer, which is polled
by handler threads. Flow execution is implemented by sending the packet
plus metadata on a dedicated tap device, where there is a BPF program
that understands the format of the packet coming from userspace. When
this device receives a message, it strips the metadata, uses it to
determine how to execute the packet, then forwards the packet onwards.

This initial implementation has a number of limitations which are
expected to go away over time:
* The set of matches and actions supported by the datapath is not
as wide as the full set known by OVS, so if a flow cannot be
expressed in the current eBPF API, OVS will log errors and return
errors during flow put.
* Only the input port and packet length are passed as metadata from
the datapath to userspace during upcall. Key extraction is done
purely from the packet provided from the datapath.
* Conversely, only the output port is sent down during execution.
No other actions are supported currently; and only one output is
supported.
* Ingress policing cannot be configured on BPF
datapath devices.
* On startup, if the OVS BPF datapath is already loaded into the
kernel and pinned to the filesystem, it will reuse this datapath,
even if the datapath is out-of-date.

Documentation/intro/install/bpf.rst contains further information on how
to build and use the bpf datapath.

For more details on the design and implementation, see our OSR paper:
[1] https://dl.acm.org/citation.cfm?id=3139657
[2] http://openvswitch.org/support/ovscon2016/7/1120-tu.pdf

Signed-off-by: Joe Stringer <joe@...>
Signed-off-by: William Tu <u9012063@...>
Signed-off-by: Yifeng Sun <pkusunyifeng@...>
Co-authored-by: William Tu <u9012063@...>
Co-authored-by: Yifeng Sun <pkusunyifeng@...>
---
lib/dpif-bpf.c | 1995 +++++++++++++++++++++++++++++++++++++++++++++++++++
lib/dpif-provider.h | 1 +
lib/dpif.c | 3 +
3 files changed, 1999 insertions(+)
create mode 100644 lib/dpif-bpf.c

diff --git a/lib/dpif-bpf.c b/lib/dpif-bpf.c
new file mode 100644
index 000000000000..d0931af78278
--- /dev/null
+++ b/lib/dpif-bpf.c
@@ -0,0 +1,1995 @@
+/*
+ * Copyright (c) 2016, 2017, 2018 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include <errno.h>
+#include <openvswitch/hmap.h>
+#include <openvswitch/types.h>
+#include <openvswitch/vlog.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+
+#include "bpf.h"
+#include "bpf/odp-bpf.h"
+#include "dirs.h"
+#include "dpif.h"
+#include "dpif-provider.h"
+#include "dpif-bpf-odp.h"
+#include "dpif-netlink-rtnl.h"
+#include "fat-rwlock.h"
+#include "netdev.h"
+#include "netdev-provider.h"
+#include "netdev-vport.h"
+#include "odp-util.h"
+#include "ovs-numa.h"
+#include "perf-event.h"
+#include "sset.h"
+#include "openvswitch/poll-loop.h"
+
+VLOG_DEFINE_THIS_MODULE(dpif_bpf);
+/* Rate limiter shared by the *_RL logging calls in this file. */
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60);
+
+/* Protects against changes to 'bpf_datapaths'. */
+static struct ovs_mutex bpf_datapath_mutex = OVS_MUTEX_INITIALIZER;
+
+/* Contains all 'struct dpif_bpf_dp's, keyed by datapath name. */
+static struct shash bpf_datapaths OVS_GUARDED_BY(bpf_datapath_mutex)
+ = SHASH_INITIALIZER(&bpf_datapaths);
+
+/* Per-handler bookkeeping stored in dpif_bpf_dp 'handlers'.  The code that
+ * fills these fields in is outside this part of the file; going by the field
+ * comments, each handler owns a run of entries in the datapath's 'channels'
+ * array -- TODO confirm against the handler setup code. */
+struct bpf_handler {
+ /* Into owning dpif_bpf_dp->channels */
+ int offset;
+ int count;
+ int index; /* next channel to use */
+};
+
+/* State of one named BPF datapath.  Instances are created by create_dp_bpf(),
+ * registered in 'bpf_datapaths' and shared, through 'ref_cnt', by every
+ * 'struct dpif_bpf' opened on the same name. */
+struct dpif_bpf_dp {
+ struct dpif *dpif; /* Set by dpif_bpf_open() when it creates the dp. */
+ const char *const name; /* Heap copy made in create_dp_bpf(). */
+ struct ovs_refcount ref_cnt;
+ atomic_flag destroyed; /* Set the first time dpif_bpf_destroy() runs. */
+
+ /* Ports.
+ *
+ * Any lookup into 'ports' requires taking 'port_mutex'. */
+ struct ovs_mutex port_mutex;
+ struct hmap ports_by_odp OVS_GUARDED;
+ struct hmap ports_by_ifindex OVS_GUARDED;
+ struct seq *port_seq; /* Incremented whenever a port changes. */
+ uint64_t last_seq; /* Last 'port_seq' value seen by dpif_bpf_port_poll(). */
+
+ /* Handlers */
+ struct fat_rwlock upcall_lock;
+ uint32_t n_handlers;
+ struct bpf_handler *handlers;
+
+ /* Upcall channels. */
+ size_t page_size; /* sysconf(_SC_PAGESIZE), see create_dp_bpf(). */
+ int n_pages; /* create_dp_bpf() sets this to 8. */
+ int n_channels; /* One per core reported by ovs_numa_get_n_cores(). */
+ struct perf_channel channels[]; /* 'n_channels' entries. */
+};
+
+/* One open handle on a BPF datapath.  'dpif' is the generic part handed back
+ * to callers (dpif_bpf_cast() recovers this struct from it); 'dp' is the
+ * shared datapath state the handle holds a reference on. */
+struct dpif_bpf {
+ struct dpif dpif;
+ struct dpif_bpf_dp *dp;
+};
+
+/* A network device attached to a BPF datapath.  Built by port_create() and
+ * torn down by do_del_port(). */
+struct dpif_bpf_port {
+ struct hmap_node odp_node; /* Node in dpif_bpf_dp 'ports_by_odp'. */
+ struct hmap_node if_node; /* Node in dpif_bpf_dp 'ports_by_ifindex'. */
+ struct netdev *netdev;
+ odp_port_t port_no;
+ int ifindex; /* From netdev_get_ifindex() at creation time. */
+ char *type; /* Port type as requested by user. */
+ struct netdev_saved_flags *sf; /* Restored when the port is deleted. */
+
+ unsigned n_rxq; /* Number of entries in 'rxqs'. */
+ struct netdev_rxq **rxqs;
+};
+
+/* Logs the 'count' bytes starting at 'buf' as a hex dump at debug level. */
+static void
+vlog_hex_dump(const u8 *buf, size_t count)
+{
+    struct ds dump = DS_EMPTY_INITIALIZER;
+
+    ds_put_hex_dump(&dump, buf, count, 0, false);
+    VLOG_DBG("\n%s", ds_cstr(&dump));
+    ds_destroy(&dump);
+}
+
+/* Forward declarations for functions used before their definitions below. */
+int create_dp_bpf(const char *name, struct dpif_bpf_dp **dp);
+static void dpif_bpf_close(struct dpif *dpif);
+static int do_add_port(struct dpif_bpf_dp *dp, const char *devname,
+ const char *type, odp_port_t port_no)
+ OVS_REQUIRES(dp->port_mutex);
+static void do_del_port(struct dpif_bpf_dp *dp, struct dpif_bpf_port *port)
+ OVS_REQUIRES(dp->port_mutex);
+static int dpif_bpf_delete_all_flow(void);
+
+/* Returns the 'struct dpif_bpf' that embeds 'dpif', asserting first that
+ * 'dpif' really belongs to the BPF provider. */
+static struct dpif_bpf *
+dpif_bpf_cast(const struct dpif *dpif)
+{
+    const struct dpif_class *class = dpif->dpif_class;
+
+    ovs_assert(class == &dpif_bpf_class);
+    return CONTAINER_OF(dpif, struct dpif_bpf, dpif);
+}
+
+/* Returns the shared datapath state behind the open handle 'dpif'. */
+static struct dpif_bpf_dp *
+get_dpif_bpf_dp(const struct dpif *dpif)
+{
+    struct dpif_bpf *handle = dpif_bpf_cast(dpif);
+
+    return handle->dp;
+}
+
+/* Process-wide BPF state, filled in once by dpif_bpf_init(): the loaded
+ * programs and maps ('bpf') and the tap device packets are sent on when
+ * userspace executes actions ('outport'). */
+static struct dp_bpf {
+ struct bpf_state bpf;
+ struct netdev *outport; /* Used for downcall. */
+} datapath;
+
+/* Attaches the downcall BPF program to 'outport' and brings the device up.
+ * Returns 0 on success, otherwise the error from the step that failed. */
+static int
+configure_outport(struct netdev *outport)
+{
+    int error = netdev_set_filter(outport, &datapath.bpf.downcall);
+
+    if (!error) {
+        error = netdev_set_flags(outport, NETDEV_UP, NULL);
+    }
+    return error;
+}
+
+/* One-time, process-wide initialization: obtains the BPF state, opens and
+ * configures the "ovs-system" tap device used for downcalls and, when all of
+ * that succeeded, clears the flow table.  The outcome of the first call is
+ * cached and returned by every later call. */
+static int
+dpif_bpf_init(void)
+{
+    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
+    static int error = 0;
+    struct netdev *tap;
+
+    if (!ovsthread_once_start(&once)) {
+        return error;
+    }
+
+    error = bpf_get(&datapath.bpf, true);
+    if (error) {
+        goto done;
+    }
+
+    /* FIXME: should the device be named ovs-system? */
+    error = netdev_open("ovs-system", "tap", &tap);
+    if (error) {
+        goto done;
+    }
+    VLOG_INFO("%s: created BPF tap downcall device %s",
+              __func__, tap->name);
+
+    error = configure_outport(tap);
+    if (error) {
+        VLOG_ERR("%s: configure downcall device failed", __func__);
+        netdev_close(tap);
+        goto done;
+    }
+    datapath.outport = tap;
+
+    /* Everything succeeded: start from an empty flow table. */
+    dpif_bpf_delete_all_flow();
+
+done:
+    ovsthread_once_done(&once);
+    return error;
+}
+
+/* Adds the name of every registered BPF datapath to 'all_dps'.  Always
+ * returns 0. */
+static int
+dpif_bpf_enumerate(struct sset *all_dps,
+                   const struct dpif_class *dpif_class OVS_UNUSED)
+{
+    struct shash_node *entry;
+
+    ovs_mutex_lock(&bpf_datapath_mutex);
+    SHASH_FOR_EACH(entry, &bpf_datapaths) {
+        const char *dp_name = entry->name;
+
+        sset_add(all_dps, dp_name);
+    }
+    ovs_mutex_unlock(&bpf_datapath_mutex);
+
+    return 0;
+}
+
+/* Maps the port type "internal" to the netdev type "tap"; every other type
+ * is returned unchanged. */
+static const char
+*dpif_bpf_port_open_type(const struct dpif_class *dpif_class OVS_UNUSED,
+                         const char *type)
+{
+    if (!strcmp(type, "internal")) {
+        return "tap";
+    }
+    return type;
+}
+
+/* Allocates a new open handle on 'dp', taking a reference on 'dp' for it.
+ * The netflow engine type and id passed to dpif_init() are the high and low
+ * bytes of a 16-bit hash of the datapath name. */
+static struct dpif *
+create_dpif_bpf(struct dpif_bpf_dp *dp)
+    OVS_REQUIRES(bpf_datapath_mutex)
+{
+    struct dpif_bpf *handle;
+    uint16_t netflow_id;
+
+    netflow_id = hash_string(dp->name, 0);
+    ovs_refcount_ref(&dp->ref_cnt);
+
+    handle = xmalloc(sizeof *handle);
+    dpif_init(&handle->dpif, &dpif_bpf_class, dp->name,
+              netflow_id >> 8, netflow_id);
+    handle->dp = dp;
+
+    return &handle->dpif;
+}
+
+/* Opens the BPF datapath called 'name' and stores a new handle in '*dpifp'.
+ *
+ * With 'create' set the datapath must not exist yet (EEXIST otherwise) and is
+ * created; without it the datapath must already exist (ENODEV otherwise).
+ * Returns 0 on success or a positive errno value. */
+static int
+dpif_bpf_open(const struct dpif_class *dpif_class OVS_UNUSED,
+              const char *name, bool create OVS_UNUSED, struct dpif **dpifp)
+{
+    struct dpif_bpf_dp *dp;
+    int error;
+
+    error = dpif_bpf_init();
+    if (error) {
+        VLOG_ERR("dpif_bpf_init failed");
+        return error;
+    }
+
+    ovs_mutex_lock(&bpf_datapath_mutex);
+    dp = shash_find_data(&bpf_datapaths, name);
+    if (dp) {
+        ovs_assert(dpif_class == &dpif_bpf_class);
+        error = create ? EEXIST : 0;
+    } else if (create) {
+        error = create_dp_bpf(name, &dp);
+    } else {
+        error = ENODEV;
+    }
+
+    if (!error) {
+        struct dpif *handle = create_dpif_bpf(dp);
+
+        *dpifp = handle;
+        if (create) { /* XXX */
+            dp->dpif = handle;
+        }
+    }
+    ovs_mutex_unlock(&bpf_datapath_mutex);
+
+    return error;
+}
+
+/* Opens one perf event channel per entry of 'dp->channels', using the entry
+ * index as the CPU number and a buffer of 'n_pages' + 1 pages.
+ *
+ * Returns 0 on success.  On failure, closes every channel that was opened
+ * before the failing one and returns the error from perf_channel_open(). */
+static int
+perf_event_channels_init(struct dpif_bpf_dp *dp)
+{
+    size_t length = dp->page_size * (dp->n_pages + 1);
+    int cpu;
+
+    for (cpu = 0; cpu < dp->n_channels; cpu++) {
+        int error = perf_channel_open(&dp->channels[cpu], cpu, length);
+
+        if (error) {
+            /* Unwind channels [0, cpu).  Channel 'cpu' itself failed to
+             * open and must not be closed. */
+            while (cpu-- > 0) {
+                perf_channel_close(&dp->channels[cpu]);
+            }
+            return error;
+        }
+    }
+
+    return 0;
+}
+
+/* Unregisters 'dp' from 'bpf_datapaths' and, if no reference to it remains,
+ * releases everything it owns, including the copy of its name made by
+ * create_dp_bpf().
+ *
+ * Callers drop their own reference before calling this; while the reference
+ * count is nonzero only the unregistration happens. */
+static void
+dpif_bpf_free(struct dpif_bpf_dp *dp)
+    OVS_REQUIRES(bpf_datapath_mutex)
+{
+    shash_find_and_delete(&bpf_datapaths, dp->name);
+
+    if (ovs_refcount_read(&dp->ref_cnt) == 0) {
+        ovs_mutex_destroy(&dp->port_mutex);
+        seq_destroy(dp->port_seq);
+        fat_rwlock_destroy(&dp->upcall_lock);
+        hmap_destroy(&dp->ports_by_ifindex);
+        hmap_destroy(&dp->ports_by_odp);
+        free(dp->handlers);     /* free(NULL) is a no-op. */
+        free(CONST_CAST(char *, dp->name));
+        free(dp);
+    }
+}
+
+/* Creates the datapath 'name': allocates its state with one upcall channel
+ * per core, registers it in 'bpf_datapaths', opens the perf event channels
+ * and publishes each channel fd in the BPF 'upcalls' map under its CPU index.
+ *
+ * On success stores the new datapath in '*dp_' and returns 0.  On failure
+ * undoes all of the above, leaves '*dp_' untouched and returns a positive
+ * errno value. */
+int
+create_dp_bpf(const char *name, struct dpif_bpf_dp **dp_)
+    OVS_REQUIRES(bpf_datapath_mutex)
+{
+    struct dpif_bpf_dp *dp;
+    int max_cpu;
+    int i, error;
+
+    /* NOTE(review): confirm ovs_numa_get_n_cores() cannot return a
+     * "unknown" sentinel here; it is used directly as an array size. */
+    max_cpu = ovs_numa_get_n_cores();
+
+    dp = xzalloc(sizeof *dp + max_cpu * sizeof(struct perf_channel));
+    ovs_refcount_init(&dp->ref_cnt);
+    atomic_flag_clear(&dp->destroyed);
+    hmap_init(&dp->ports_by_odp);
+    hmap_init(&dp->ports_by_ifindex);
+    fat_rwlock_init(&dp->upcall_lock);
+    dp->port_seq = seq_create();
+    ovs_mutex_init(&dp->port_mutex);
+    dp->n_pages = 8;
+    dp->page_size = sysconf(_SC_PAGESIZE);
+    dp->n_channels = max_cpu;
+    dp->last_seq = seq_read(dp->port_seq);
+
+    *CONST_CAST(const char **, &dp->name) = xstrdup(name);
+    shash_add(&bpf_datapaths, name, dp); /* XXX */
+
+    /* On failure this has already closed whatever it opened. */
+    error = perf_event_channels_init(dp);
+    if (error) {
+        goto err_free;
+    }
+
+    ovs_assert(datapath.bpf.upcalls.fd != -1);
+
+    for (i = 0; i < dp->n_channels; i++) {
+        if (bpf_map_update_elem(datapath.bpf.upcalls.fd, &i,
+                                &dp->channels[i].fd, 0)) {
+            /* The call's own result is not an errno value; the reason is
+             * in 'errno'.  Save it before logging can clobber it. */
+            error = errno;
+            VLOG_WARN("failed to insert channel fd on cpu=%d: %s",
+                      i, ovs_strerror(error));
+            goto err_close;
+        }
+    }
+
+    *dp_ = dp;
+    return 0;
+
+err_close:
+    for (i = 0; i < dp->n_channels; i++) {
+        perf_channel_close(&dp->channels[i]);
+    }
+err_free:
+    /* Drop the initial reference so that dpif_bpf_free() really frees. */
+    ovs_refcount_unref_relaxed(&dp->ref_cnt);
+    dpif_bpf_free(dp);
+    return error;
+}
+
+/* Releases the open handle 'dpif_'.  When this drops the last reference on
+ * the underlying datapath, also closes its upcall channels, deletes all of
+ * its ports and frees it. */
+static void
+dpif_bpf_close(struct dpif *dpif_)
+{
+    struct dpif_bpf *handle = dpif_bpf_cast(dpif_);
+    struct dpif_bpf_dp *dp = handle->dp;
+
+    ovs_mutex_lock(&bpf_datapath_mutex);
+    if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
+        struct dpif_bpf_port *port, *next_port;
+        int ch;
+
+        fat_rwlock_wrlock(&dp->upcall_lock);
+        for (ch = 0; ch < dp->n_channels; ch++) {
+            perf_channel_close(&dp->channels[ch]);
+        }
+        fat_rwlock_unlock(&dp->upcall_lock);
+
+        ovs_mutex_lock(&dp->port_mutex);
+        HMAP_FOR_EACH_SAFE (port, next_port, odp_node, &dp->ports_by_odp) {
+            do_del_port(dp, port);
+        }
+        ovs_mutex_unlock(&dp->port_mutex);
+
+        dpif_bpf_free(dp);
+    }
+    ovs_mutex_unlock(&bpf_datapath_mutex);
+
+    free(handle);
+}
+
+/* Marks the datapath behind 'dpif_' as destroyed.  The first call drops one
+ * reference on the datapath; later calls do nothing.  Always returns 0. */
+static int
+dpif_bpf_destroy(struct dpif *dpif_)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+
+    if (!atomic_flag_test_and_set(&dp->destroyed)
+        && ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
+        /* Can't happen: 'dpif' still owns a reference to 'dp'.
+         * The workflow is first call dpif_class->destroy() then
+         * dpif->close(). */
+        OVS_NOT_REACHED();
+    }
+#if 0
+    if (datapath.outport) {
+        netdev_close(datapath.outport);
+    }
+#endif
+
+    return 0;
+}
+
+/* Fills 'stats' from the BPF maps: hit and missed counters come from the
+ * 'datapath_stats' map, and the flow count is obtained by walking every key
+ * of the flow table.  Counters that cannot be read stay zero.  Always
+ * returns 0. */
+static int
+dpif_bpf_get_stats(const struct dpif *dpif OVS_UNUSED,
+                   struct dpif_dp_stats *stats)
+{
+    struct bpf_flow_key cursor;
+    uint32_t n_flows = 0;
+    uint32_t key;
+
+    memset(stats, 0, sizeof(*stats));
+
+    key = OVS_DP_STATS_HIT;
+    if (bpf_map_lookup_elem(datapath.bpf.datapath_stats.fd, &key,
+                            &stats->n_hit)) {
+        VLOG_INFO("datapath_stats lookup failed (%d): %s", key,
+                  ovs_strerror(errno));
+    }
+
+    key = OVS_DP_STATS_MISSED;
+    if (bpf_map_lookup_elem(datapath.bpf.datapath_stats.fd, &key,
+                            &stats->n_missed)) {
+        VLOG_INFO("datapath_stats lookup failed (%d): %s", key,
+                  ovs_strerror(errno));
+    }
+
+    /* Count the number of datapath flow entries */
+    memset(&cursor, 0, sizeof cursor);
+    while (!bpf_map_get_next_key(datapath.bpf.flow_table.fd,
+                                 &cursor, &cursor)) {
+        n_flows++;
+    }
+    stats->n_flows = n_flows;
+
+    /* XXX: Other missing stats */
+    return 0;
+}
+
+/* Returns the port of 'dp' numbered 'port_no', or NULL if there is none. */
+static struct dpif_bpf_port *
+bpf_lookup_port(const struct dpif_bpf_dp *dp, odp_port_t port_no)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct dpif_bpf_port *candidate;
+
+    HMAP_FOR_EACH_WITH_HASH (candidate, odp_node,
+                             netdev_hash_port_no(port_no),
+                             &dp->ports_by_odp) {
+        if (candidate->port_no != port_no) {
+            continue;
+        }
+        return candidate;
+    }
+    return NULL;
+}
+
+/* Returns the lowest port number in [1, UINT16_MAX] that 'dp' does not use
+ * yet, or ODPP_NONE when all of them are taken. */
+static odp_port_t
+choose_port(struct dpif_bpf_dp *dp)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    uint32_t candidate = 1;
+
+    while (candidate <= UINT16_MAX) {
+        odp_port_t odp = u32_to_odp(candidate);
+
+        if (!bpf_lookup_port(dp, odp)) {
+            return u32_to_odp(candidate);
+        }
+        candidate++;
+    }
+
+    return ODPP_NONE;
+}
+
+/* Looks for the port of 'dp' whose netdev is named 'devname'.  Stores it in
+ * '*portp' and returns 0 if found; otherwise stores NULL and returns
+ * ENOENT. */
+static int
+get_port_by_name(struct dpif_bpf_dp *dp, const char *devname,
+                 struct dpif_bpf_port **portp)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct dpif_bpf_port *candidate;
+
+    *portp = NULL;
+    HMAP_FOR_EACH (candidate, odp_node, &dp->ports_by_odp) {
+        const char *candidate_name = netdev_get_name(candidate->netdev);
+
+        if (strcmp(candidate_name, devname) == 0) {
+            *portp = candidate;
+            return 0;
+        }
+    }
+
+    return ENOENT;
+}
+
+/* Hash used for the 'ports_by_ifindex' map. */
+static uint32_t
+hash_ifindex(int ifindex)
+{
+    const uint32_t basis = 0;
+
+    return hash_int(ifindex, basis);
+}
+
+/* Looks for the port of 'dp' whose device has interface index 'ifindex'.
+ * Stores it in '*portp' and returns 0 if found; otherwise stores NULL and
+ * returns ENOENT. */
+static int
+get_port_by_ifindex(struct dpif_bpf_dp *dp, int ifindex,
+                    struct dpif_bpf_port **portp)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct dpif_bpf_port *candidate;
+
+    *portp = NULL;
+    HMAP_FOR_EACH_WITH_HASH (candidate, if_node, hash_ifindex(ifindex),
+                             &dp->ports_by_ifindex) {
+        if (candidate->ifindex != ifindex) {
+            continue;
+        }
+        *portp = candidate;
+        return 0;
+    }
+
+    return ENOENT;
+}
+
+/* Translates the interface index 'ifindex' into the datapath port number of
+ * the matching port of 'dp', or ODPP_NONE if no port has that index. */
+static odp_port_t
+ifindex_to_odp(struct dpif_bpf_dp *dp, int ifindex)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct dpif_bpf_port *port;
+    int error = get_port_by_ifindex(dp, ifindex, &port);
+
+    return error ? ODPP_NONE : port->port_no;
+}
+
+/* Returns true if 'netdev' is a tap device. */
+static bool
+output_to_local_stack(struct netdev *netdev)
+{
+    const char *type = netdev_get_type(netdev);
+
+    return strcmp(type, "tap") == 0;
+}
+
+/* Reports whether an XDP program should be attached to 'netdev'.  Currently
+ * every device is treated as XDP-capable. */
+static bool
+netdev_support_xdp(struct netdev *netdev OVS_UNUSED)
+{
+    return true;
+}
+
+/* Returns the BPF port flags for 'netdev': OVS_BPF_FLAGS_TX_STACK for tap
+ * devices, 0 for everything else. */
+static uint32_t
+get_port_flags(struct netdev *netdev)
+{
+    if (output_to_local_stack(netdev)) {
+        return OVS_BPF_FLAGS_TX_STACK;
+    }
+    return 0;
+}
+
+/* Translates the datapath port number 'port_no' of 'dp' into the interface
+ * index of its device.
+ *
+ * Returns the interface index, or 0 if 'dp' has no such port.  If 'flags' is
+ * nonnull it receives the port's BPF flags (see get_port_flags()), or 0 when
+ * the port does not exist.
+ *
+ * The result is 32 bits wide: 'ifindex' is stored as an int and a 16-bit
+ * return type would silently truncate values above 65535. */
+static uint32_t
+odp_port_to_ifindex(struct dpif_bpf_dp *dp, odp_port_t port_no, uint32_t *flags)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct dpif_bpf_port *port = bpf_lookup_port(dp, port_no);
+
+    if (!port) {
+        if (flags) {
+            *flags = 0;
+        }
+        return 0;
+    }
+
+    if (flags) {
+        *flags = get_port_flags(port->netdev);
+    }
+    return port->ifindex;
+}
+
+/* Modelled after dpif-netdev 'port_create', minus pmd and txq logic, plus bpf
+ * filter set.
+ *
+ * Opens the network device 'devname' of type 'type', attaches the datapath's
+ * BPF programs to it, opens its receive queues and turns on promiscuous
+ * mode.  On success stores the new port, numbered 'port_no', in '*portp' and
+ * returns 0.  On failure stores NULL in '*portp', undoes whatever had been
+ * set up and returns a positive errno value. */
+static int
+port_create(const char *devname, const char *type,
+            odp_port_t port_no, struct dpif_bpf_port **portp)
+{
+    struct netdev_saved_flags *sf;
+    struct dpif_bpf_port *port;
+    enum netdev_flags flags;
+    struct netdev *netdev;
+    int n_open_rxqs = 0;
+    int i, error;
+    int ifindex;
+
+    *portp = NULL;
+
+    /* Open and validate network device. */
+    error = netdev_open(devname, type, &netdev);
+
+    VLOG_DBG("%s %s type %s error %d", __func__, devname, type, error);
+    if (error) {
+        return error;
+    }
+    /* XXX reject non-Ethernet devices */
+
+    netdev_get_flags(netdev, &flags);
+    if (flags & NETDEV_LOOPBACK) {
+        VLOG_ERR_RL(&rl, "%s: cannot add a loopback device", devname);
+        error = EINVAL;
+        goto out;
+    }
+
+    if (netdev_is_reconf_required(netdev)) {
+        error = netdev_reconfigure(netdev);
+        if (error) {
+            goto out;
+        }
+    }
+
+    ifindex = netdev_get_ifindex(netdev);
+    if (ifindex < 0) {
+        VLOG_WARN_RL(&rl, "%s: Failed to get ifindex", devname);
+        error = -ifindex;
+        goto out;
+    }
+
+    VLOG_DBG("%s ifindex = %d", devname, ifindex);
+
+    /* For every internal port (e.g. br0, br-underlay, br-int) the BPF
+     * program is attached only to the egress queue, due to the nature of
+     * tap devices.  For other types (e.g. eth0, vxlan_sys) it is attached
+     * to the ingress queue.
+     *
+     * A tap device's egress queue is tied to a socket through which
+     * userspace receives the packet via open(/dev/tun0).  Conversely, a
+     * send to that socket shows up in the tap device's ingress queue. */
+    if (output_to_local_stack(netdev)) {
+        error = netdev_set_filter(netdev, &datapath.bpf.egress);
+    } else {
+        error = netdev_set_filter(netdev, &datapath.bpf.ingress);
+    }
+    if (error) {
+        goto out;
+    }
+
+    if (netdev_support_xdp(netdev)) {
+        error = netdev_set_xdp(netdev, &datapath.bpf.xdp);
+        if (error) {
+            VLOG_WARN("%s XDP set failed", __func__);
+            goto out_clear_filter;
+        }
+        VLOG_DBG("%s %s XDP set done", __func__, netdev->name);
+    }
+
+    port = xzalloc(sizeof *port);
+    port->port_no = port_no;
+    port->ifindex = ifindex;
+    port->netdev = netdev;
+    port->n_rxq = netdev_n_rxq(netdev);
+    port->rxqs = xcalloc(port->n_rxq, sizeof *port->rxqs);
+    port->type = xstrdup(type);
+
+    for (i = 0; i < port->n_rxq; i++) {
+        error = netdev_rxq_open(netdev, &port->rxqs[i], i);
+        if (error) {
+            /* Report the code that was returned; 'errno' may hold an
+             * unrelated, stale value at this point. */
+            VLOG_ERR("%s: cannot receive packets on this network device (queue %d) (%s)",
+                     devname, i, ovs_strerror(error));
+            goto out_rxq_close;
+        }
+        n_open_rxqs++;
+    }
+
+    error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
+    if (error) {
+        goto out_rxq_close;
+    }
+    port->sf = sf;
+
+    *portp = port;
+    return 0;
+
+out_rxq_close:
+    for (i = 0; i < n_open_rxqs; i++) {
+        netdev_rxq_close(port->rxqs[i]);
+    }
+    free(port->type);
+    free(port->rxqs);
+    free(port);
+
+    /* Best effort: do not leave the datapath's programs attached to a
+     * device that did not become a port.  Mirrors do_del_port(). */
+    if (netdev_support_xdp(netdev)) {
+        netdev_set_xdp(netdev, NULL);
+    }
+out_clear_filter:
+    netdev_set_filter(netdev, NULL);
+
+out:
+    netdev_close(netdev);
+    return error;
+}
+
+/* Creates a port numbered 'port_no' for device 'devname' of type 'type' and
+ * registers it in both port maps of 'dp', then bumps the port sequence
+ * number.  Returns EEXIST if 'dp' already has a port on that device, or the
+ * error from port_create(). */
+static int
+do_add_port(struct dpif_bpf_dp *dp, const char *devname,
+            const char *type, odp_port_t port_no)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct dpif_bpf_port *port;
+    int error;
+
+    if (get_port_by_name(dp, devname, &port) == 0) {
+        return EEXIST;
+    }
+
+    error = port_create(devname, type, port_no, &port);
+    if (error) {
+        VLOG_ERR("port_create return %d", error);
+        return error;
+    }
+
+    hmap_insert(&dp->ports_by_odp, &port->odp_node,
+                netdev_hash_port_no(port->port_no));
+    hmap_insert(&dp->ports_by_ifindex, &port->if_node,
+                hash_ifindex(port->ifindex));
+    seq_change(dp->port_seq);
+
+    return 0;
+}
+
+/* Adds 'netdev' to the datapath behind 'dpif'.
+ *
+ * For vxlan, gre and geneve the kernel device is first created through
+ * rtnetlink.  If '*port_nop' is ODPP_NONE a free port number is chosen,
+ * otherwise the requested number is used (EBUSY if taken).  On success the
+ * number in use is stored back in '*port_nop'.  Returns 0 or a positive
+ * errno value. */
+static int
+dpif_bpf_port_add(struct dpif *dpif, struct netdev *netdev,
+                  odp_port_t *port_nop)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif);
+    char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
+    const char *type = netdev_get_type(netdev);
+    const char *dpif_port;
+    odp_port_t port_no;
+    int error;
+
+    if (!strcmp(type, "vxlan") || !strcmp(type, "gre")
+        || !strcmp(type, "geneve")) {
+        VLOG_INFO("Creating %s device", type);
+        error = dpif_netlink_rtnl_port_create(netdev);
+        if (error) {
+            if (error != EOPNOTSUPP) {
+                VLOG_WARN_RL(&rl, "Failed to create %s with rtnetlink: %s",
+                             netdev_get_name(netdev), ovs_strerror(error));
+            }
+            return error;
+        }
+    }
+
+    ovs_mutex_lock(&dp->port_mutex);
+    dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
+
+    if (*port_nop == ODPP_NONE) {
+        port_no = choose_port(dp);
+        error = port_no == ODPP_NONE ? EFBIG : 0;
+    } else {
+        port_no = *port_nop;
+        error = bpf_lookup_port(dp, *port_nop) ? EBUSY : 0;
+    }
+
+    if (!error) {
+        *port_nop = port_no;
+        error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
+    }
+    ovs_mutex_unlock(&dp->port_mutex);
+
+    return error;
+}
+
+/* Removes 'port' from both port maps of 'dp', detaches the BPF filter and
+ * XDP program from its device (failures are only logged), releases the
+ * device, its saved flags and receive queues, and frees 'port'. */
+static void
+do_del_port(struct dpif_bpf_dp *dp, struct dpif_bpf_port *port)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct netdev *netdev = port->netdev;
+    int q, error;
+
+    seq_change(dp->port_seq);
+    hmap_remove(&dp->ports_by_odp, &port->odp_node);
+    hmap_remove(&dp->ports_by_ifindex, &port->if_node);
+
+    error = netdev_set_filter(netdev, NULL);
+    if (error) {
+        VLOG_WARN("%s: Failed to clear filter from netdev",
+                  netdev_get_name(netdev));
+    }
+
+    if (netdev_support_xdp(netdev)) {
+        error = netdev_set_xdp(netdev, NULL);
+        if (error) {
+            VLOG_WARN("%s: Failed to clear XDP from netdev",
+                      netdev_get_name(netdev));
+        }
+    }
+
+    netdev_close(netdev);
+    netdev_restore_flags(port->sf);
+    for (q = 0; q < port->n_rxq; q++) {
+        netdev_rxq_close(port->rxqs[q]);
+    }
+
+    free(port->type);
+    free(port->rxqs);
+    free(port);
+}
+
+/* Deletes the port numbered 'port_no' from the datapath behind 'dpif'.
+ * Returns 0 on success, or EINVAL if the datapath has no such port. */
+static int
+dpif_bpf_port_del(struct dpif *dpif, odp_port_t port_no)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif);
+    struct dpif_bpf_port *port;
+    int error = 0;
+
+    ovs_mutex_lock(&dp->port_mutex);
+    port = bpf_lookup_port(dp, port_no);
+    if (!port) {
+        VLOG_WARN("deleting port %d, but it doesn't exist", port_no);
+        error = EINVAL;
+    } else {
+        /* Looking the port up is not enough: actually tear it down. */
+        do_del_port(dp, port);
+    }
+    ovs_mutex_unlock(&dp->port_mutex);
+
+    return error;
+}
+
+/* Fills 'dpif_port' with heap-allocated copies of the name and type of
+ * 'port', plus its port number. */
+static void
+answer_port_query(const struct dpif_bpf_port *port,
+                  struct dpif_port *dpif_port)
+{
+    const char *dev_name = netdev_get_name(port->netdev);
+
+    dpif_port->name = xstrdup(dev_name);
+    dpif_port->type = xstrdup(port->type);
+    dpif_port->port_no = port->port_no;
+}
+
+/* Looks up the port numbered 'port_no' in the datapath behind 'dpif_'.
+ *
+ * Returns 0 and, if 'port_' is nonnull, fills it in (see
+ * answer_port_query()) when the port exists.  Returns ENOENT, leaving
+ * 'port_' untouched, when it does not. */
+static int
+dpif_bpf_port_query_by_number(const struct dpif *dpif_, odp_port_t port_no,
+                              struct dpif_port *port_)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+    struct dpif_bpf_port *port;
+    int error = 0;
+
+    ovs_mutex_lock(&dp->port_mutex);
+    port = bpf_lookup_port(dp, port_no);
+    if (!port) {
+        /* Report the miss through the return value, not through the
+         * global 'errno'. */
+        error = ENOENT;
+    } else if (port_) {
+        answer_port_query(port, port_);
+    }
+    ovs_mutex_unlock(&dp->port_mutex);
+
+    return error;
+}
+
+/* Looks up the port whose device is named 'devname' in the datapath behind
+ * 'dpif_'.  Returns 0 if it exists, filling in 'dpif_port' when that is
+ * nonnull; otherwise returns the error from get_port_by_name(). */
+static int
+dpif_bpf_port_query_by_name(const struct dpif *dpif_, const char *devname,
+                            struct dpif_port *dpif_port)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+    struct dpif_bpf_port *found;
+    int error;
+
+    ovs_mutex_lock(&dp->port_mutex);
+    error = get_port_by_name(dp, devname, &found);
+    if (error == 0 && dpif_port != NULL) {
+        answer_port_query(found, dpif_port);
+    }
+    ovs_mutex_unlock(&dp->port_mutex);
+
+    return error;
+}
+
+/* Iteration state for a port dump, allocated by dpif_bpf_port_dump_start()
+ * and released by dpif_bpf_port_dump_done(). */
+struct dpif_bpf_port_state {
+ struct hmap_position position; /* Cursor into 'ports_by_odp'. */
+ char *name; /* Name of the most recently dumped port; owned here. */
+};
+
+/* Begins a port dump by storing freshly zeroed iteration state in
+ * '*statep'.  Always returns 0. */
+static int
+dpif_bpf_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
+{
+    struct dpif_bpf_port_state *state = xzalloc(sizeof *state);
+
+    *statep = state;
+    return 0;
+}
+
+/* Advances the port dump 'state_' by one port.
+ *
+ * Returns 0 and fills in 'dpif_port' while ports remain, EOF once the dump
+ * is complete.  'dpif_port->name' points at a copy owned by the dump state
+ * that is replaced on the next call; 'dpif_port->type' points at the port's
+ * own type string. */
+static int
+dpif_bpf_port_dump_next(const struct dpif *dpif_, void *state_,
+                        struct dpif_port *dpif_port)
+{
+    struct dpif_bpf_port_state *state = state_;
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+    struct dpif_bpf_port *port;
+    struct hmap_node *node;
+
+    ovs_mutex_lock(&dp->port_mutex);
+    node = hmap_at_position(&dp->ports_by_odp, &state->position);
+    if (!node) {
+        ovs_mutex_unlock(&dp->port_mutex);
+        return EOF;
+    }
+
+    port = CONTAINER_OF(node, struct dpif_bpf_port, odp_node);
+
+    free(state->name);
+    state->name = xstrdup(netdev_get_name(port->netdev));
+    dpif_port->name = state->name;
+    dpif_port->type = port->type;
+    dpif_port->port_no = port->port_no;
+    ovs_mutex_unlock(&dp->port_mutex);
+
+    return 0;
+}
+
+/* Ends a port dump, freeing the iteration state 'state_' together with the
+ * port name it still holds.  Always returns 0. */
+static int
+dpif_bpf_port_dump_done(const struct dpif *dpif OVS_UNUSED,
+                        void *state_)
+{
+    struct dpif_bpf_port_state *dump_state = state_;
+    char *last_name = dump_state->name;
+
+    free(last_name);
+    free(dump_state);
+    return 0;
+}
+
+/* Reports whether the port set changed since the previous poll.  Returns
+ * ENOBUFS (and remembers the new sequence number) if it did, EAGAIN if it
+ * did not.  No device name is ever returned through 'devnamep'. */
+static int
+dpif_bpf_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+    uint64_t seq_now = seq_read(dp->port_seq);
+
+    if (seq_now == dp->last_seq) {
+        return EAGAIN;
+    }
+
+    dp->last_seq = seq_now;
+    return ENOBUFS;
+}
+
+/* Arranges for the poll loop to wake up once the port sequence number moves
+ * past the value last seen by dpif_bpf_port_poll(). */
+static void
+dpif_bpf_port_poll_wait(const struct dpif *dpif_)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+    uint64_t seen = dp->last_seq;
+
+    seq_wait(dp->port_seq, seen);
+}
+
+/* Removes every entry from the flow table and from the flow stats table by
+ * repeatedly fetching the next key and deleting it.  Deletion failures are
+ * ignored.  Returns 0 if 'errno' is ENOENT once both walks have finished,
+ * otherwise the value of 'errno'. */
+static int
+dpif_bpf_flow_flush(struct dpif *dpif OVS_UNUSED)
+{
+    struct bpf_flow_key cursor;
+
+    /* Flow Entry Table */
+    memset(&cursor, 0, sizeof cursor);
+    while (!bpf_map_get_next_key(datapath.bpf.flow_table.fd,
+                                 &cursor, &cursor)) {
+        bpf_map_delete_elem(datapath.bpf.flow_table.fd, &cursor);
+    }
+
+    /* Flow Stats Table */
+    memset(&cursor, 0, sizeof cursor);
+    while (!bpf_map_get_next_key(datapath.bpf.dp_flow_stats.fd,
+                                 &cursor, &cursor)) {
+        bpf_map_delete_elem(datapath.bpf.dp_flow_stats.fd, &cursor);
+    }
+
+    if (errno == ENOENT) {
+        return 0;
+    }
+    return errno;
+}
+
+/* State of one flow dump, shared by all of its dump threads. */
+struct dpif_bpf_flow_dump {
+ struct dpif_flow_dump up; /* Generic part; see dpif_bpf_flow_dump_cast(). */
+ int status; /* 0 while dumping, else the errno value that ended the dump. */
+ struct bpf_flow_key pos; /* Last flow table key visited. */
+ struct ovs_mutex mutex; /* Taken by dpif_bpf_flow_dump_next(). */
+};
+
+/* Returns the BPF flow dump that embeds the generic dump 'dump'. */
+static struct dpif_bpf_flow_dump *
+dpif_bpf_flow_dump_cast(struct dpif_flow_dump *dump)
+{
+    struct dpif_bpf_flow_dump *bpf_dump;
+
+    bpf_dump = CONTAINER_OF(dump, struct dpif_bpf_flow_dump, up);
+    return bpf_dump;
+}
+
+/* Starts a flow dump on 'dpif_'.  The returned dump is zero-initialized
+ * apart from its generic part, the 'terse' setting and its mutex, so it
+ * begins with status 0 and an all-zero position key. */
+static struct dpif_flow_dump *
+dpif_bpf_flow_dump_create(const struct dpif *dpif_, bool terse,
+                          char *type OVS_UNUSED)
+{
+    struct dpif_bpf_flow_dump *bpf_dump = xzalloc(sizeof *bpf_dump);
+
+    dpif_flow_dump_init(&bpf_dump->up, dpif_);
+    bpf_dump->up.terse = terse;
+    ovs_mutex_init(&bpf_dump->mutex);
+
+    return &bpf_dump->up;
+}
+
+/* Frees the flow dump 'dump_' and returns its final status, with ENOENT
+ * (the flow table was walked to its end) reported as success. */
+static int
+dpif_bpf_flow_dump_destroy(struct dpif_flow_dump *dump_)
+{
+    struct dpif_bpf_flow_dump *bpf_dump = dpif_bpf_flow_dump_cast(dump_);
+    int final_status = bpf_dump->status;
+
+    ovs_mutex_destroy(&bpf_dump->mutex);
+    free(bpf_dump);
+
+    if (final_status == ENOENT) {
+        return 0;
+    }
+    return final_status;
+}
+
+/* Per-thread state of a flow dump. */
+struct dpif_bpf_flow_dump_thread {
+ struct dpif_flow_dump_thread up; /* Generic part. */
+ struct dpif_bpf_flow_dump *dump; /* The dump this thread belongs to. */
+ struct ofpbuf buf; /* Stores key,mask,acts for a particular dump. */
+};
+
+/* Returns the BPF dump thread that embeds the generic dump thread
+ * 'thread'. */
+static struct dpif_bpf_flow_dump_thread *
+dpif_bpf_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
+{
+    struct dpif_bpf_flow_dump_thread *bpf_thread;
+
+    bpf_thread = CONTAINER_OF(thread, struct dpif_bpf_flow_dump_thread, up);
+    return bpf_thread;
+}
+
+/* Creates the per-thread state for taking part in the flow dump 'dump_',
+ * including a 1024-byte initial buffer for translated flows. */
+static struct dpif_flow_dump_thread *
+dpif_bpf_flow_dump_thread_create(struct dpif_flow_dump *dump_)
+{
+    struct dpif_bpf_flow_dump *bpf_dump = dpif_bpf_flow_dump_cast(dump_);
+    struct dpif_bpf_flow_dump_thread *worker = xmalloc(sizeof *worker);
+
+    dpif_flow_dump_thread_init(&worker->up, &bpf_dump->up);
+    worker->dump = bpf_dump;
+    ofpbuf_init(&worker->buf, 1024);
+
+    return &worker->up;
+}
+
+/* Releases the per-thread dump state 'thread_' and its buffer. */
+static void
+dpif_bpf_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
+{
+    struct dpif_bpf_flow_dump_thread *worker;
+
+    worker = dpif_bpf_flow_dump_thread_cast(thread_);
+    ofpbuf_uninit(&worker->buf);
+    free(worker);
+}
+
+/* Reads the flow stored under 'key' in the BPF flow table and describes it
+ * in 'flow' in netlink (ODP) format.
+ *
+ * 'out' is cleared and reused as backing storage: on success 'flow->key' and
+ * 'flow->actions' point into it, so they stay valid only until 'out' is next
+ * modified.
+ *
+ * Returns 0 on success; 'errno' if the flow table or flow stats lookup
+ * fails; EINVAL if the BPF key cannot be parsed; or the error from
+ * bpf_actions_to_odp_actions().
+ *
+ * NOTE(review): ifindex_to_odp() is annotated OVS_REQUIRES(dp->port_mutex)
+ * but this function does not take that mutex -- confirm what the callers
+ * hold. */
+static int
+fetch_flow(struct dpif_bpf_dp *dp, struct dpif_flow *flow,
+ struct ofpbuf *out, const struct bpf_flow_key *key)
+{
+ struct flow f;
+ struct odp_flow_key_parms parms = {
+ .flow = &f,
+ };
+ struct bpf_action_batch action;
+ struct bpf_flow_stats stats;
+ int err;
+
+ memset(flow, 0, sizeof *flow);
+
+ err = bpf_map_lookup_elem(datapath.bpf.flow_table.fd, key, &action);
+ if (err) {
+ return errno;
+ }
+
+ /* XXX: Extract 'dp_flow' into 'flow'. */
+ if (bpf_flow_key_to_flow(key, &f) == ODP_FIT_ERROR) {
+ VLOG_WARN("%s: bpf flow key parsing error", __func__);
+ return EINVAL;
+ }
+ /* The parsed key carries an interface index in the in_port field;
+ * replace it with the corresponding datapath port number. */
+ f.in_port.odp_port = ifindex_to_odp(dp,
+ odp_to_u32(f.in_port.odp_port));
+
+ /* Translate BPF flow into netlink format. */
+ ofpbuf_clear(out);
+
+ /* Use 'out->header' to point to the flow key, 'out->msg' for actions */
+ out->header = out->data;
+ odp_flow_key_from_flow(&parms, out);
+ out->msg = ofpbuf_tail(out);
+ err = bpf_actions_to_odp_actions(&action, out);
+ if (err) {
+ VLOG_ERR("%s: bpf_actions to odp actions fails", __func__);
+ return err;
+ }
+
+ flow->key = out->header;
+ flow->key_len = ofpbuf_headersize(out);
+ flow->actions = out->msg;
+ flow->actions_len = ofpbuf_msgsize(out);
+
+ /* A UFID is computed from the key but deliberately reported as absent. */
+ dpif_flow_hash(dp->dpif, flow->key, flow->key_len, &flow->ufid);
+ flow->ufid_present = false; /* XXX */
+
+ /* Fetch datapath flow stats */
+ err = bpf_map_lookup_elem(datapath.bpf.dp_flow_stats.fd, key, &stats);
+ if (err) {
+ VLOG_DBG("flow stats lookup fails, fd %d err = %d %s",
+ datapath.bpf.dp_flow_stats.fd, err, ovs_strerror(errno));
+ return errno;
+ } else {
+ VLOG_DBG("flow stats lookup OK");
+ /* Copies the first three 64-bit words of the BPF stats record;
+ * presumably packet count, byte count and last-used time, matching
+ * the fields set in dpif_bpf_insert_flow() -- TODO confirm layout. */
+ memcpy(&flow->stats, &stats, 3 * sizeof(uint64_t));
+ }
+
+ return 0;
+}
+
+/* Installs 'actions' under 'flow_key' in the BPF flow table, replacing any
+ * existing entry, and (re)initializes the matching flow stats entry to one
+ * packet of the length recorded in the key's metadata.  Returns 0 on
+ * success, otherwise 'errno' from the failing map update. */
+static int
+dpif_bpf_insert_flow(struct bpf_flow_key *flow_key,
+                     struct bpf_action_batch *actions)
+{
+    struct bpf_flow_stats initial_stats;
+    int flow_fd, stats_fd;
+
+    VLOG_DBG("Insert bof_flow_key:");
+    vlog_hex_dump((unsigned char *)flow_key, sizeof *flow_key);
+
+    VLOG_DBG("Insert action:");
+    vlog_hex_dump((unsigned char *)actions, sizeof actions[0]);
+
+    ovs_assert(datapath.bpf.flow_table.fd != -1);
+    flow_fd = datapath.bpf.flow_table.fd;
+    if (bpf_map_update_elem(flow_fd, flow_key, actions, BPF_ANY)) {
+        VLOG_ERR("Failed to add flow into flow table, map fd %d, error %s",
+                 datapath.bpf.flow_table.fd, ovs_strerror(errno));
+        return errno;
+    }
+
+    initial_stats.packet_count = 1;
+    initial_stats.byte_count = flow_key->mds.md.packet_length;
+    initial_stats.used = 0;
+
+    stats_fd = datapath.bpf.dp_flow_stats.fd;
+    if (bpf_map_update_elem(stats_fd, flow_key, &initial_stats, BPF_ANY)) {
+        VLOG_ERR("Failed to add flow into flow stats table, map fd %d, error %s",
+                 datapath.bpf.dp_flow_stats.fd, ovs_strerror(errno));
+        return errno;
+    }
+
+    return 0;
+}
+
+/* Removes the flow stored under 'flow_key' from the BPF flow table and from
+ * the flow stats table.
+ *
+ * If 'stats' is nonnull and the flow existed, 'stats' is zeroed (real
+ * statistics are not reported yet).  A flow that is missing from the flow
+ * table is logged but not treated as an error; its stats entry is still
+ * removed.  Returns 0, or 'errno' if deleting an existing flow table entry
+ * fails. */
+static int
+dpif_bpf_delete_flow(struct bpf_flow_key *flow_key,
+                     struct dpif_flow_stats *stats)
+{
+    struct bpf_action_batch actions;
+
+    ovs_assert(datapath.bpf.flow_table.fd != -1);
+
+    if (bpf_map_lookup_elem(datapath.bpf.flow_table.fd, flow_key, &actions)) {
+        VLOG_ERR("Failed to find flow into flow table, map fd %d: %s",
+                 datapath.bpf.flow_table.fd, ovs_strerror(errno));
+        VLOG_WARN("bpf_flow_key not found\n");
+        vlog_hex_dump((unsigned char *)flow_key, sizeof *flow_key);
+    } else {
+        if (bpf_map_delete_elem(datapath.bpf.flow_table.fd, flow_key)) {
+            int error = errno;  /* Saved before logging can clobber it. */
+
+            VLOG_ERR("Failed to del flow into flow table, map fd %d: %s",
+                     datapath.bpf.flow_table.fd, ovs_strerror(error));
+            return error;
+        }
+
+        if (stats) {
+            /* XXX: Stats */
+            memset(stats, 0, sizeof *stats);
+        }
+    }
+
+    /* Always drop the stats entry, whether or not the caller asked for
+     * statistics; otherwise entries pile up in the stats map. */
+    if (bpf_map_delete_elem(datapath.bpf.dp_flow_stats.fd, flow_key)) {
+        /* Skip when element is not found */
+        VLOG_ERR("Failed to del flow into flow stat table, map fd %d: %s",
+                 datapath.bpf.dp_flow_stats.fd, ovs_strerror(errno));
+    }
+    return 0;
+}
+
+/* Empties the BPF flow table by repeatedly asking for a key (with no
+ * current key, i.e. starting from the beginning) and deleting it.
+ *
+ * Returns 0 once the table is empty, otherwise a positive errno value.
+ * Running out of keys (ENOENT) is the normal way for the walk to end and is
+ * reported as success. */
+static int
+dpif_bpf_delete_all_flow(void)
+{
+    struct bpf_flow_key key;
+
+    while (!bpf_map_get_next_key(datapath.bpf.flow_table.fd, NULL, &key)) {
+        if (bpf_map_delete_elem(datapath.bpf.flow_table.fd, &key)
+            && errno != ENOENT) {
+            /* ENOENT only means the entry vanished in the meantime. */
+            return errno;
+        }
+    }
+
+    return errno == ENOENT ? 0 : errno;
+}
+
+/* Stores up to 'max_flows' flows from the dump that 'thread_' belongs to
+ * into 'flows' and returns how many were stored.  Returns 0 once the dump
+ * has finished or failed; the reason is kept in the dump's status.
+ *
+ * NOTE(review): every flow is translated into the same 'thread->buf', which
+ * fetch_flow() clears on each call, so when more than one flow is returned
+ * the key/actions pointers of earlier entries refer to storage that has been
+ * overwritten.  Needs per-flow buffers -- confirm and fix with fetch_flow(). */
+static int
+dpif_bpf_flow_dump_next(struct dpif_flow_dump_thread *thread_,
+                        struct dpif_flow *flows, int max_flows)
+{
+    struct dpif_bpf_flow_dump_thread *thread =
+        dpif_bpf_flow_dump_thread_cast(thread_);
+    struct dpif_bpf_flow_dump *dump = thread->dump;
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dump->up.dpif);
+    int n = 0;
+    int err;
+
+    ovs_mutex_lock(&dump->mutex);
+    err = dump->status;
+    if (err) {
+        goto unlock;
+    }
+
+    /* 'flows' has room for exactly 'max_flows' entries, so the last valid
+     * index is 'max_flows' - 1. */
+    while (n < max_flows) {
+        err = bpf_map_get_next_key(datapath.bpf.flow_table.fd,
+                                   &dump->pos, &dump->pos);
+        if (err) {
+            err = errno;
+            break;
+        }
+        err = fetch_flow(dp, &flows[n], &thread->buf, &dump->pos);
+        if (err == ENOENT) {
+            /* Flow disappeared. Oh well, we tried. */
+            continue;
+        } else if (err) {
+            break;
+        }
+        n++;
+    }
+    dump->status = err;
+unlock:
+    ovs_mutex_unlock(&dump->mutex);
+    return n;
+}
+
+/* Arguments for dpif_bpf_downcall(). */
+struct dpif_bpf_downcall_parms {
+ uint32_t type; /* Copied into the downcall metadata 'type' field. */
+ odp_port_t port_no; /* Set by dpif_bpf_output(). */
+ struct bpf_action_batch *action_batch; /* Actions to run, or NULL. */
+};
+
+/* Hands 'packet' to the datapath ("downcall").
+ *
+ * A struct bpf_downcall is built from 'parms->type', the metadata derived
+ * from 'flow' and the ifindex of the flow's input port.  It is appended to
+ * 'packet', a clone of the result is sent on datapath.outport, and the
+ * appended bytes are then trimmed off 'packet' again.  If
+ * 'parms->action_batch' is nonnull it is first stored at index 0 of the
+ * execute_actions map.
+ *
+ * Returns 0 on success, a positive errno value if the map update fails, or
+ * the result of netdev_send() otherwise. */
+static int
+dpif_bpf_downcall(struct dpif *dpif_, struct dp_packet *packet,
+                  const struct flow *flow,
+                  struct dpif_bpf_downcall_parms *parms)
+{
+    struct dp_packet_batch batch;
+    struct bpf_downcall md = {
+        .type = parms->type,
+        .debug = 0xC0FFEEEE,
+    };
+    uint32_t ifindex;
+    uint32_t flags;
+    int error;
+    int queue = 0;
+    struct dp_packet *clone_pkt;
+
+    ovs_assert(datapath.bpf.execute_actions.fd != -1);
+
+    bpf_metadata_from_flow(flow, &md.md);
+
+    /* A zero ifindex is deliberately not treated as an error here: this is
+     * ok at check_support time. */
+    ifindex = odp_port_to_ifindex(get_dpif_bpf_dp(dpif_),
+                                  flow->in_port.odp_port, &flags);
+
+    md.md.md.in_port = ifindex;
+    md.ifindex = ifindex;
+
+    if (parms->action_batch) {
+        int zero_index = 0;
+
+        if (bpf_map_update_elem(datapath.bpf.execute_actions.fd,
+                                &zero_index, parms->action_batch, 0)) {
+            /* Save errno before logging can overwrite it, and report a
+             * positive errno value like the rest of this file does. */
+            error = errno;
+            VLOG_ERR("%s: map update failed: %s", __func__,
+                     ovs_strerror(error));
+            return error;
+        }
+    }
+
+    /* XXX: Check that ovs-system device MTU is large enough to include md. */
+    dp_packet_put(packet, &md, sizeof md);
+    clone_pkt = dp_packet_clone(packet);
+    dp_packet_batch_init_packet(&batch, clone_pkt);
+
+    VLOG_INFO("send downcall (%d)", parms->type);
+    error = netdev_send(datapath.outport, queue, &batch, false);
+
+    /* Undo the dp_packet_put() above so the caller's packet is unchanged. */
+    dp_packet_set_size(packet, dp_packet_size(packet) - sizeof md);
+
+    return error;
+}
+
+/* Issues an OVS_BPF_DOWNCALL_OUTPUT downcall for 'packet', recording
+ * 'port_no' in the downcall parameters and supplying no action batch.
+ *
+ * Returns whatever dpif_bpf_downcall() returns. */
+static int OVS_UNUSED
+dpif_bpf_output(struct dpif *dpif_, struct dp_packet *packet,
+                const struct flow *flow, odp_port_t port_no,
+                uint32_t flags OVS_UNUSED)
+{
+    struct dpif_bpf_downcall_parms output_parms;
+
+    output_parms.type = OVS_BPF_DOWNCALL_OUTPUT;
+    output_parms.port_no = port_no;
+    output_parms.action_batch = NULL;
+
+    return dpif_bpf_downcall(dpif_, packet, flow, &output_parms);
+}
+
+/* Issues an OVS_BPF_DOWNCALL_EXECUTE downcall for 'packet', handing
+ * 'action_batch' to the datapath along with it.
+ *
+ * Returns whatever dpif_bpf_downcall() returns. */
+static int
+dpif_bpf_execute_(struct dpif *dpif_, struct dp_packet *packet,
+                  const struct flow *flow,
+                  struct bpf_action_batch *action_batch)
+{
+    struct dpif_bpf_downcall_parms exec_parms;
+
+    /* Every member that is not set explicitly below stays zero. */
+    memset(&exec_parms, 0, sizeof exec_parms);
+    exec_parms.type = OVS_BPF_DOWNCALL_EXECUTE;
+    exec_parms.action_batch = action_batch;
+
+    return dpif_bpf_downcall(dpif_, packet, flow, &exec_parms);
+}
+
+/* Translates the netlink-formatted actions in 'nlactions' ('actions_len'
+ * bytes) into 'action_batch', which is zeroed first.
+ *
+ * Output actions are rewritten to carry the ifindex and port flags of the
+ * port looked up in 'dp'; every other action goes through
+ * odp_action_to_bpf_action().  Actions that function rejects are only
+ * counted and logged, they still occupy a slot in the batch.
+ *
+ * Returns 0 on success, or E2BIG if 'nlactions' holds more actions than
+ * 'action_batch' has room for. */
+static int
+dpif_bpf_serialize_actions(struct dpif_bpf_dp *dp,
+                           struct bpf_action_batch *action_batch,
+                           const struct nlattr *nlactions,
+                           size_t actions_len)
+{
+    const struct nlattr *a;
+    unsigned int left, count = 0, skipped = 0;
+    struct bpf_action *actions;
+
+    memset(action_batch, 0, sizeof *action_batch);
+    actions = action_batch->actions;
+
+    NL_ATTR_FOR_EACH_UNSAFE (a, left, nlactions, actions_len) {
+        enum ovs_action_attr type = nl_attr_type(a);
+
+        /* Never write past the end of the fixed-size batch. */
+        if (count >= ARRAY_SIZE(action_batch->actions)) {
+            VLOG_WARN("Too many flow actions (at most %"PRIuSIZE
+                      " are supported)",
+                      ARRAY_SIZE(action_batch->actions));
+            return E2BIG;
+        }
+        actions[count].type = type;
+
+        if (type == OVS_ACTION_ATTR_OUTPUT) {
+            struct dpif_bpf_port *port;
+            odp_port_t port_no = nl_attr_get_odp_port(a);
+
+            ovs_mutex_lock(&dp->port_mutex);
+            port = bpf_lookup_port(dp, port_no);
+            if (port) {
+                VLOG_INFO("output action to port %d ifindex %d", port_no,
+                          port->ifindex);
+                actions[count].u.out.port = port->ifindex;
+                actions[count].u.out.flags = get_port_flags(port->netdev);
+            } else {
+                /* The action keeps its zeroed port; make that visible. */
+                VLOG_WARN("output action to unknown port %"PRIu32, port_no);
+            }
+            ovs_mutex_unlock(&dp->port_mutex);
+        } else {
+            if (odp_action_to_bpf_action(a, &actions[count])) {
+                skipped++;
+            }
+        }
+        count++;
+    }
+
+    /* XXX: VLOG actions that couldn't be processed */
+    VLOG_INFO("Processing flow actions (%u/%u skipped)", skipped, count);
+    return 0;
+}
+
+/* Implements DPIF_OP_EXECUTE: serializes the actions of 'execute' into a BPF
+ * action batch and sends the packet down to the datapath with it.
+ *
+ * Returns 0 on success, otherwise the error from the step that failed. */
+static int
+dpif_bpf_execute(struct dpif *dpif_, struct dpif_execute *execute)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+    struct bpf_action_batch batch;
+    int error;
+
+    error = dpif_bpf_serialize_actions(dp, &batch, execute->actions,
+                                       execute->actions_len);
+    if (!error) {
+        error = dpif_bpf_execute_(dpif_, execute->packet,
+                                  execute->flow, &batch);
+    }
+    return error;
+}
+
+/* Translates 'port' into an ifindex and sets it inside 'key'.
+ *
+ * Port 0 is allowed to map to ifindex 0; for any other port a zero ifindex
+ * means that the port is unknown.
+ *
+ * Returns 0 on success, or a positive errno otherwise. */
+static int
+set_in_port(struct dpif_bpf_dp *dp, struct bpf_flow_key *key, odp_port_t port)
+{
+    /* 32 bits wide, like the ifindex that dpif_bpf_downcall() obtains from
+     * the same lookup, so that large interface indexes are not truncated. */
+    uint32_t ifindex;
+
+    ifindex = odp_port_to_ifindex(dp, port, NULL);
+    if (!ifindex && port) {
+        VLOG_WARN("Could not find ifindex corresponding to port %"PRIu32,
+                  port);
+        return ENODEV;
+    }
+
+    key->mds.md.in_port = ifindex;
+    return 0;
+}
+
+/* Builds the BPF form of a flow.
+ *
+ * 'key' ('key_len' bytes) is translated into 'key_out', whose input port is
+ * then replaced by the matching ifindex.  If 'batch' is nonnull, 'actions'
+ * ('actions_len' bytes) are serialized into it.  'mask' ('mask_len' bytes)
+ * is only used for log output.  'verbose' is passed on to the key
+ * translation and enables the extra log message when that translation fails.
+ *
+ * Returns 0 on success, or a positive errno otherwise. */
+static int
+prepare_bpf_flow__(struct dpif_bpf_dp *dp,
+                   const struct nlattr *key, size_t key_len,
+                   const struct nlattr *mask, size_t mask_len,
+                   const struct nlattr *actions, size_t actions_len,
+                   struct bpf_flow_key *key_out, struct bpf_action_batch *batch,
+                   bool verbose)
+{
+    odp_port_t odp_in_port;
+    int error;
+
+    /* Log the flow that is about to be translated. */
+    {
+        struct ds before = DS_EMPTY_INITIALIZER;
+
+        /* XXX: Use dpif_format_flow()? */
+        odp_flow_format(key, key_len, mask, mask_len, NULL, &before, true);
+        ds_put_cstr(&before, ", actions=");
+        format_odp_actions(&before, actions, actions_len, NULL);
+        VLOG_WARN("Translating odp key to bpf key:\n%s", ds_cstr(&before));
+        ds_destroy(&before);
+    }
+
+    memset(key_out, 0, sizeof *key_out);
+    if (odp_key_to_bpf_flow_key(key, key_len, key_out,
+                                &odp_in_port, false, verbose)) {
+        if (verbose) {
+            struct ds failed = DS_EMPTY_INITIALIZER;
+
+            /* XXX: Use dpif_format_flow()? */
+            odp_flow_format(key, key_len, mask, mask_len, NULL, &failed,
+                            true);
+            VLOG_WARN("Failed to translate odp key to bpf key:\n%s",
+                      ds_cstr(&failed));
+            ds_destroy(&failed);
+        }
+        return EINVAL;
+    }
+
+    error = set_in_port(dp, key_out, odp_in_port);
+    if (error) {
+        return error;
+    }
+
+    if (batch) {
+        error = dpif_bpf_serialize_actions(dp, batch, actions, actions_len);
+        if (error) {
+            return error;
+        }
+    }
+
+    /* Transfer back to flow to check if everything is good.  A mismatch is
+     * only logged, it does not fail the call. */
+    {
+        struct flow roundtrip;
+
+        if (bpf_flow_key_to_flow(key_out, &roundtrip) == ODP_FIT_PERFECT) {
+            struct ds after = DS_EMPTY_INITIALIZER;
+
+            flow_format(&after, &roundtrip, NULL);
+            ds_put_cstr(&after, ", actions=");
+            format_odp_actions(&after, actions, actions_len, NULL);
+            VLOG_WARN("Translating back:\n%s", ds_cstr(&after));
+            ds_destroy(&after);
+        } else {
+            VLOG_ERR("transfer bpf key back to flow failed");
+        }
+    }
+
+    return 0;
+}
+
+/* Key-only variant of prepare_bpf_flow__(): translates 'key' ('key_len'
+ * bytes) into 'key_out' without a mask or actions.
+ *
+ * Returns 0 on success, or a positive errno otherwise. */
+static int
+prepare_bpf_flow(struct dpif_bpf_dp *dp, const struct nlattr *key,
+                 size_t key_len, struct bpf_flow_key *key_out, bool verbose)
+{
+    int error;
+
+    error = prepare_bpf_flow__(dp, key, key_len,
+                               NULL, 0,     /* No mask. */
+                               NULL, 0,     /* No actions. */
+                               key_out, NULL, verbose);
+    return error;
+}
+
+/* Carries out the 'n_ops' operations in 'ops' one after another and stores
+ * the result of each in its 'error' member.
+ *
+ * Flow put, get and delete first translate the netlink key into a BPF flow
+ * key; when that translation fails its error becomes the operation's result
+ * and the flow table is not touched.  Flow put stays quiet about translation
+ * failures when the DPIF_FP_PROBE flag is set. */
+static void
+dpif_bpf_operate(struct dpif *dpif_, struct dpif_op **ops, size_t n_ops)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+
+    for (size_t i = 0; i < n_ops; i++) {
+        struct dpif_op *op = ops[i];
+
+        switch (op->type) {
+        case DPIF_OP_EXECUTE:
+            op->error = dpif_bpf_execute(dpif_, &op->u.execute);
+            break;
+        case DPIF_OP_FLOW_PUT: {
+            struct dpif_flow_put *put = &op->u.flow_put;
+            bool verbose = !(put->flags & DPIF_FP_PROBE);
+            struct bpf_action_batch action_batch;
+            struct bpf_flow_key key;
+            int err;
+
+            err = prepare_bpf_flow__(dp, put->key, put->key_len,
+                                     put->mask, put->mask_len,
+                                     put->actions, put->actions_len,
+                                     &key, &action_batch, verbose);
+            if (!err) {
+                err = dpif_bpf_insert_flow(&key, &action_batch);
+            }
+            op->error = err;
+            break;
+        }
+        case DPIF_OP_FLOW_GET: {
+            struct dpif_flow_get *get = &op->u.flow_get;
+            struct bpf_flow_key key;
+            int err;
+
+            err = prepare_bpf_flow(dp, get->key, get->key_len, &key, true);
+            if (!err) {
+                err = fetch_flow(dp, get->flow, get->buffer, &key);
+            }
+            op->error = err;
+            break;
+        }
+        case DPIF_OP_FLOW_DEL: {
+            struct dpif_flow_del *del = &op->u.flow_del;
+            struct bpf_flow_key key;
+            int err;
+
+            err = prepare_bpf_flow(dp, del->key, del->key_len, &key, true);
+            if (!err) {
+                err = dpif_bpf_delete_flow(&key, del->stats);
+            }
+            op->error = err;
+            break;
+        }
+        default:
+            OVS_NOT_REACHED();
+        }
+    }
+}
+
+/* Enables or disables, according to 'enable', every perf channel of the
+ * datapath.  All channels are attempted even after a failure.
+ *
+ * Returns 0 if every channel was updated, otherwise the error reported for
+ * the last channel that failed. */
+static int
+dpif_bpf_recv_set(struct dpif *dpif_, bool enable)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+    int last_error = 0;
+    int idx = 0;
+
+    while (idx < dp->n_channels) {
+        int rc = perf_channel_set(&dp->channels[idx], enable);
+
+        if (rc) {
+            VLOG_ERR("failed to set recv_set %s (%s)",
+                     enable ? "true": "false", ovs_strerror(rc));
+            last_error = rc;
+        }
+        idx++;
+    }
+
+    return last_error;
+}
+
+/* Replaces the handler array of 'dp' with 'n_handlers' handlers and splits
+ * the channels of 'dp' between them.
+ *
+ * Each handler first gets n_channels / n_handlers consecutive channels.  The
+ * n_channels % n_handlers left-over channels are then given, one each, to
+ * the handlers at the end of the array, whose offsets are shifted to match.
+ * With 'n_handlers' equal to 0 the old array is just released.
+ *
+ * Always returns 0. */
+static int
+dpif_bpf_handlers_set__(struct dpif_bpf_dp *dp, uint32_t n_handlers)
+    OVS_REQUIRES(&dp->upcall_lock)
+{
+    struct bpf_handler last;
+    int idx, remainder;
+
+    memset(&last, 0, sizeof last);
+
+    /* Throw away the previous assignment, if there is one. */
+    if (dp->n_handlers) {
+        free(dp->handlers);
+        dp->handlers = NULL;
+        dp->n_handlers = 0;
+    }
+
+    if (n_handlers == 0) {
+        return 0;
+    }
+
+    dp->handlers = xzalloc(sizeof *dp->handlers * n_handlers);
+
+    /* Even share: every handler starts where the one before it ended. */
+    for (idx = 0; idx < n_handlers; idx++) {
+        struct bpf_handler *h = &dp->handlers[idx];
+
+        if (idx > dp->n_channels) {
+            VLOG_INFO("Ignoring extraneous handlers (%d for %d channels)",
+                      n_handlers, dp->n_channels);
+            break;
+        }
+
+        h->offset = last.offset + last.count;
+        h->count = dp->n_channels / n_handlers;
+        last = *h;
+    }
+
+    /* Left-overs: walk backwards from the last handler. */
+    remainder = dp->n_channels % n_handlers;
+    if (remainder) {
+        VLOG_INFO("Extra %d channels; distributing across handlers",
+                  remainder);
+        for (idx = 0; idx < remainder; idx++) {
+            struct bpf_handler *h = &dp->handlers[n_handlers - idx - 1];
+
+            h->offset += remainder - idx - 1;
+            h->count++;
+        }
+    }
+
+    dp->n_handlers = n_handlers;
+    return 0;
+}
+
+/* Sets the number of upcall handlers to 'n_handlers', holding the upcall
+ * lock for writing while the handler array is rebuilt.
+ *
+ * Returns the result of dpif_bpf_handlers_set__(). */
+static int
+dpif_bpf_handlers_set(struct dpif *dpif_, uint32_t n_handlers)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+    int result;
+
+    fat_rwlock_wrlock(&dp->upcall_lock);
+    result = dpif_bpf_handlers_set__(dp, n_handlers);
+    fat_rwlock_unlock(&dp->upcall_lock);
+
+    return result;
+}
+
+/* Builds the flow for an upcall and serializes it into 'buf'.
+ *
+ * The flow is extracted from 'packet' and then given the metadata carried by
+ * 'key'.  A nonzero input port in that metadata is treated as an ifindex and
+ * translated to a datapath port number; otherwise the input port recorded in
+ * the packet's metadata is used.
+ *
+ * Always returns 0. */
+static int
+extract_key(struct dpif_bpf_dp *dpif, const struct bpf_flow_key *key,
+            struct dp_packet *packet, struct ofpbuf *buf)
+{
+    struct flow flow;
+    struct odp_flow_key_parms parms = {
+        .flow = &flow,
+    };
+    struct ds key_text = DS_EMPTY_INITIALIZER;
+    struct ds flow_text = DS_EMPTY_INITIALIZER;
+
+    parms.support.recirc = true;
+
+    bpf_flow_key_format(&key_text, key);
+    VLOG_INFO("bpf_flow_key_format\n%s", ds_cstr(&key_text));
+    ds_destroy(&key_text);
+
+    /* This function goes first because it zeros out flow. */
+    flow_extract(packet, &flow);
+
+    bpf_flow_key_extract_metadata(key, &flow);
+
+    VLOG_INFO("packet.md.port = %d", packet->md.in_port.odp_port);
+
+    flow.in_port.odp_port = (flow.in_port.odp_port != 0
+                             ? ifindex_to_odp(dpif,
+                                   odp_to_u32(flow.in_port.odp_port))
+                             : packet->md.in_port.odp_port);
+    VLOG_INFO("flow.in_port.odp_port %d", flow.in_port.odp_port);
+
+    flow_format(&flow_text, &flow, NULL);
+    VLOG_WARN("Upcall flow:\n%s",
+              ds_cstr(&flow_text));
+    ds_destroy(&flow_text);
+
+    odp_flow_key_from_flow(&parms, buf);
+
+    return 0;
+}
+
+/* Layout of one upcall event as read from a perf channel: the raw perf
+ * sample record, then the BPF upcall header, then the packet bytes. */
+struct ovs_ebpf_event {
+ /* Raw sample record; its 'size' is checked against the header size in
+ * recv_perf_sample(). */
+ struct perf_event_raw sample;
+ /* Upcall type, ifindex, packet length and flow key from the datapath. */
+ struct bpf_upcall header;
+ /* Packet contents; 'header.skb_len' gives the packet length. */
+ uint8_t data[];
+};
+
+/* Debugging aid: walks the flow table map from a zeroed start position and
+ * hex-dumps every key until fetching the next key fails. */
+static void OVS_UNUSED
+dpif_bpf_flow_dump_all(struct dpif_bpf_dp *dp OVS_UNUSED)
+{
+    struct dpif_bpf_flow_dump walker;
+
+    memset(&walker, 0, sizeof walker);
+    for (;;) {
+        int rc = bpf_map_get_next_key(datapath.bpf.flow_table.fd,
+                                      &walker.pos, &walker.pos);
+
+        if (rc) {
+            VLOG_INFO("err is %d", rc);
+            return;
+        }
+        vlog_hex_dump((unsigned char *)&walker.pos, sizeof walker.pos);
+    }
+}
+
+/* Common part of turning the perf event 'e' into 'upcall'.
+ *
+ * perf_channel_read() fills the first part of 'buffer' with the full event.
+ * Here, the key will be extracted immediately following it, and 'upcall'
+ * will be initialized to point within 'buffer'.
+ *
+ * On success 'upcall' is zeroed and then given the packet (backed by
+ * 'e->data'), the extracted key, its length and the flow's ufid; the caller
+ * is left to set the upcall type.
+ *
+ * Returns 0 on success, EINVAL if the packet is shorter than an Ethernet
+ * header, longer than the sample payload, or arrived on an ifindex with no
+ * datapath port, or otherwise the error from extract_key().
+ */
+static int
+perf_sample_to_upcall__(struct dpif_bpf_dp *dp, struct ovs_ebpf_event *e,
+ struct dpif_upcall *upcall, struct ofpbuf *buffer)
+{
+ /* Sample bytes that follow the upcall header.  recv_perf_sample() has
+ * already checked that the sample is at least as large as the header. */
+ size_t sample_len = e->sample.size - sizeof e->header;
+ size_t pkt_len = e->header.skb_len;
+ size_t pre_key_len;
+ odp_port_t port_no;
+ int err;
+
+ if (pkt_len < ETH_HEADER_LEN) {
+ VLOG_WARN_RL(&rl, "Unexpectedly short packet (%"PRIuSIZE")", pkt_len);
+ return EINVAL;
+ }
+ /* Same quantity as 'sample_len' above. */
+ if (e->sample.size - sizeof e->header < pkt_len) {
+ VLOG_WARN_RL(&rl,
+ "Packet longer than sample (pkt=%"PRIuSIZE", sample=%"PRIuSIZE")",
+ pkt_len, sample_len);
+ return EINVAL;
+ }
+
+ port_no = ifindex_to_odp(dp, e->header.ifindex);
+ VLOG_INFO("ifindex %d odp %d", e->header.ifindex, port_no);
+ if (port_no == ODPP_NONE) {
+ VLOG_WARN_RL(&rl, "failed to map upcall ifindex=%d to odp",
+ e->header.ifindex);
+ return EINVAL;
+ }
+
+ memset(upcall, 0, sizeof *upcall);
+
+ /* Use buffer->header to point to the packet, and buffer->msg to point to
+ * the extracted flow key. Therefore, when extract_key() reallocates
+ * 'buffer', we can easily get pointers back to the packet and start of
+ * extracted key. */
+ buffer->header = e->data;
+ buffer->msg = ofpbuf_tail(buffer);
+ /* Buffer size before the key is added; used below for the key length. */
+ pre_key_len = buffer->size;
+
+ VLOG_INFO("upcall key hex\n");
+ vlog_hex_dump((unsigned char *)&e->header.key, sizeof e->header.key);
+ //VLOG_INFO("list of bpf keys\n");
+ //dpif_bpf_flow_dump_all(dp);
+ VLOG_INFO("raw packet data in e->data");
+ vlog_hex_dump(e->data, MIN(pkt_len, 100));
+
+ /* NOTE(review): the packet is backed directly by 'e->data', and it is not
+ * re-pointed at buffer->header after extract_key().  If 'buffer' really
+ * can be reallocated there, as the comment above suggests, 'e' and the
+ * packet data would presumably be left dangling -- TODO confirm. */
+ dp_packet_use_stub(&upcall->packet, e->data, pkt_len);
+ dp_packet_set_size(&upcall->packet, pkt_len);
+ pkt_metadata_init(&upcall->packet.md, port_no);
+
+ err = extract_key(dp, &e->header.key, &upcall->packet, buffer);
+ if (err) {
+ return err;
+ }
+
+ /* The key is whatever extract_key() added to 'buffer'. */
+ upcall->key = buffer->msg;
+ upcall->key_len = buffer->size - pre_key_len;
+ dpif_flow_hash(dp->dpif, upcall->key, upcall->key_len, &upcall->ufid);
+
+ return 0;
+}
+
+/* Turns the perf event 'e' into a DPIF_UC_MISS upcall.
+ *
+ * The packet and key are set up by perf_sample_to_upcall__(); afterwards
+ * 'buffer' is given enough tailroom for a struct bpf_downcall.
+ *
+ * Returns 0 on success, otherwise the error from perf_sample_to_upcall__().
+ */
+static int
+perf_sample_to_upcall_miss(struct dpif_bpf_dp *dp, struct ovs_ebpf_event *e,
+                           struct dpif_upcall *upcall, struct ofpbuf *buffer)
+{
+    int error = perf_sample_to_upcall__(dp, e, upcall, buffer);
+
+    if (!error) {
+        ofpbuf_prealloc_tailroom(buffer, sizeof(struct bpf_downcall));
+        upcall->type = DPIF_UC_MISS;
+    }
+
+    return error;
+}
+
+/* Turns the perf event 'e' into a DPIF_UC_ACTION upcall.
+ *
+ * After the common setup in perf_sample_to_upcall__(), the userspace action
+ * attributes carried in the event header are scanned: the PID attribute is
+ * ignored, the USERDATA attribute is handed to 'upcall', and any other
+ * attribute is rejected.
+ *
+ * Returns 0 on success, EOPNOTSUPP for an unsupported attribute, or the
+ * error from perf_sample_to_upcall__().
+ */
+static int
+perf_sample_to_upcall_userspace(struct dpif_bpf_dp *dp, struct ovs_ebpf_event *e,
+                                struct dpif_upcall *upcall,
+                                struct ofpbuf *buffer)
+{
+    const struct nlattr *actions = (struct nlattr *)e->header.uactions;
+    const struct nlattr *a;
+    unsigned int left;
+    int error;
+
+    error = perf_sample_to_upcall__(dp, e, upcall, buffer);
+    if (error) {
+        return error;
+    }
+
+    NL_ATTR_FOR_EACH_UNSAFE (a, left, actions, e->header.uactions_len) {
+        int attr_type = nl_attr_type(a);
+
+        if (attr_type == OVS_USERSPACE_ATTR_USERDATA) {
+            upcall->userdata = CONST_CAST(struct nlattr *, a);
+        } else if (attr_type != OVS_USERSPACE_ATTR_PID) {
+            VLOG_INFO("%s unsupported userspace action. %d",
+                      __func__, attr_type);
+            return EOPNOTSUPP;
+        }
+    }
+
+    upcall->type = DPIF_UC_ACTION;
+    return 0;
+}
+
+/* Logs a debug report received from the datapath: the name of 'subtype' (or
+ * its number when it is out of range) followed by the text for 'error'.
+ * Reports with a nonzero 'error' are logged as warnings, all others at debug
+ * level; the output is rate-limited. */
+static void
+bpf_debug_print(int subtype, int error)
+{
+    struct ds msg = DS_EMPTY_INITIALIZER;
+    int level;
+
+    if (error) {
+        level = VLL_WARN;
+    } else {
+        level = VLL_DBG;
+    }
+
+    if (subtype < 0 || subtype >= ARRAY_SIZE(bpf_upcall_subtypes)) {
+        ds_put_format(&msg, "Unknown subtype %d", subtype);
+    } else {
+        ds_put_cstr(&msg, bpf_upcall_subtypes[subtype]);
+    }
+    ds_put_format(&msg, " reports: %s", ovs_strerror(error));
+
+    VLOG_RL(&rl, level, "%s", ds_cstr(&msg));
+    ds_destroy(&msg);
+}
+
+/* Validates the perf event 'e' and dispatches it on its upcall type.
+ *
+ * Miss and action upcalls are converted into 'upcall'.  Debug upcalls are
+ * only logged and reported as EAGAIN, since they yield no upcall.
+ *
+ * Returns 0 if 'upcall' was filled in, EAGAIN for a debug upcall, EINVAL for
+ * a sample that is too short or of an unknown type, or the error from the
+ * conversion function. */
+static int
+recv_perf_sample(struct dpif_bpf_dp *dpif, struct ovs_ebpf_event *e,
+                 struct dpif_upcall *upcall, struct ofpbuf *buffer)
+{
+    if (e->sample.header.size < sizeof *e
+        || e->sample.size < sizeof e->header) {
+        VLOG_WARN_RL(&rl, "Unexpectedly short sample (%"PRIu32")",
+                     e->sample.size);
+        return EINVAL;
+    }
+
+    VLOG_INFO("\nreceived upcall %d", e->header.type);
+
+    switch (e->header.type) {
+    case OVS_UPCALL_MISS:
+        return perf_sample_to_upcall_miss(dpif, e, upcall, buffer);
+
+    case OVS_UPCALL_ACTION:
+        return perf_sample_to_upcall_userspace(dpif, e, upcall, buffer);
+
+    case OVS_UPCALL_DEBUG:
+        bpf_debug_print(e->header.subtype, e->header.error);
+        return EAGAIN;
+
+    default:
+        VLOG_WARN_RL(&rl, "Unfamiliar upcall type %d", e->header.type);
+        return EINVAL;
+    }
+}
+
+/* Tries to receive one upcall for handler 'handler_id' into 'upcall', using
+ * 'buffer' as storage.
+ *
+ * The channels owned by the handler are polled starting at a rotating index,
+ * so that successive calls do not always favor the same channel.  Polling
+ * stops at the first channel that yields anything other than EAGAIN.
+ *
+ * Returns 0 if 'upcall' was filled in, EAGAIN if nothing is pending (also
+ * when 'handler_id' is out of range or the handler owns no channels), or
+ * another positive errno value on failure. */
+static int
+dpif_bpf_recv(struct dpif *dpif_, uint32_t handler_id,
+              struct dpif_upcall *upcall, struct ofpbuf *buffer)
+{
+    struct dpif_bpf_dp *dpif = get_dpif_bpf_dp(dpif_);
+    struct bpf_handler *handler;
+    int error = EAGAIN;
+    int i;
+
+    fat_rwlock_rdlock(&dpif->upcall_lock);
+    if (!dpif->handlers || handler_id >= dpif->n_handlers) {
+        goto out;
+    }
+
+    handler = dpif->handlers + handler_id;
+    if (!handler->count) {
+        /* A handler without channels has nothing to read, and the modulo
+         * operations below would divide by zero. */
+        goto out;
+    }
+
+    for (i = 0; i < handler->count; i++) {
+        int channel_idx = (handler->index + i) % handler->count;
+        struct perf_channel *channel;
+
+        channel = &dpif->channels[handler->offset + channel_idx];
+        error = perf_channel_read(channel, buffer);
+        if (!error) {
+            error = recv_perf_sample(dpif, buffer->header, upcall, buffer);
+        }
+        if (error != EAGAIN) {
+            break;
+        }
+    }
+    handler->index = (handler->index + 1) % handler->count;
+
+out:
+    fat_rwlock_unlock(&dpif->upcall_lock);
+
+    return error;
+}
+
+/* Returns the datapath version as a newly allocated copy of the string
+ * "<built-in>". */
+static char *
+dpif_bpf_get_datapath_version(void)
+{
+    char *version = xstrdup("<built-in>");
+
+    return version;
+}
+
+/* Arranges for the poll loop to wake up when any channel owned by handler
+ * 'handler_id' becomes readable.  Does nothing if no handlers are configured
+ * or 'handler_id' is out of range. */
+static void
+dpif_bpf_recv_wait(struct dpif *dpif_, uint32_t handler_id)
+{
+    struct dpif_bpf_dp *dpif = get_dpif_bpf_dp(dpif_);
+    int i;
+
+    fat_rwlock_rdlock(&dpif->upcall_lock);
+    /* 'handlers' is NULL while no handlers are configured. */
+    if (dpif->handlers && handler_id < dpif->n_handlers) {
+        struct bpf_handler *handler = dpif->handlers + handler_id;
+
+        for (i = 0; i < handler->count; i++) {
+            poll_fd_wait(dpif->channels[handler->offset + i].fd, POLLIN);
+        }
+    }
+    fat_rwlock_unlock(&dpif->upcall_lock);
+}
+
+/* Flushes every perf channel of the datapath while holding the upcall lock
+ * for reading. */
+static void
+dpif_bpf_recv_purge(struct dpif *dpif_)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+    int idx = 0;
+
+    fat_rwlock_rdlock(&dp->upcall_lock);
+    while (idx < dp->n_channels) {
+        perf_channel_flush(&dp->channels[idx]);
+        idx++;
+    }
+    fat_rwlock_unlock(&dp->upcall_lock);
+}
+
+/* Provider table for the "bpf" datapath type.
+ *
+ * The initializer is positional, so the order of the entries must match the
+ * member order of struct dpif_class.  NULL entries are hooks that this
+ * provider does not implement; each is labeled with the hook's name. */
+const struct dpif_class dpif_bpf_class = {
+ "bpf",
+ dpif_bpf_init,
+ dpif_bpf_enumerate,
+ dpif_bpf_port_open_type,
+ dpif_bpf_open,
+ dpif_bpf_close,
+ dpif_bpf_destroy,
+ NULL, /* run */
+ NULL, /* wait */
+ dpif_bpf_get_stats,
+ dpif_bpf_port_add,
+ dpif_bpf_port_del,
+ NULL, /* port_set_config */
+ dpif_bpf_port_query_by_number,
+ dpif_bpf_port_query_by_name,
+ NULL, /* port_get_pid */
+ dpif_bpf_port_dump_start,
+ dpif_bpf_port_dump_next,
+ dpif_bpf_port_dump_done,
+ dpif_bpf_port_poll,
+ dpif_bpf_port_poll_wait,
+ dpif_bpf_flow_flush,
+ dpif_bpf_flow_dump_create,
+ dpif_bpf_flow_dump_destroy,
+ dpif_bpf_flow_dump_thread_create,
+ dpif_bpf_flow_dump_thread_destroy,
+ dpif_bpf_flow_dump_next,
+ dpif_bpf_operate,
+ dpif_bpf_recv_set,
+ dpif_bpf_handlers_set,
+ NULL, /* set_config */
+ NULL, /* queue_to_priority */
+ dpif_bpf_recv,
+ dpif_bpf_recv_wait,
+ dpif_bpf_recv_purge,
+ NULL, /* register_dp_purge_cb */
+ NULL, /* register_upcall_cb */
+ NULL, /* enable_upcall */
+ NULL, /* disable_upcall */
+ dpif_bpf_get_datapath_version,
+ NULL, /* ct_dump_start */
+ NULL, /* ct_dump_next */
+ NULL, /* ct_dump_done */
+ NULL, /* ct_flush */
+ NULL, /* ct_set_maxconns */
+ NULL, /* ct_get_maxconns */
+ NULL, /* ct_get_nconns */
+ NULL, /* meter_get_features */
+ NULL, /* meter_set */
+ NULL, /* meter_get */
+ NULL, /* meter_del */
+};
diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h
index 62b3598acfc5..ae21593ab1b2 100644
--- a/lib/dpif-provider.h
+++ b/lib/dpif-provider.h
@@ -476,6 +476,7 @@ struct dpif_class {

extern const struct dpif_class dpif_netlink_class;
extern const struct dpif_class dpif_netdev_class;
+extern const struct dpif_class dpif_bpf_class;

#ifdef __cplusplus
}
diff --git a/lib/dpif.c b/lib/dpif.c
index f03763ec55b4..43d97ec1582a 100644
--- a/lib/dpif.c
+++ b/lib/dpif.c
@@ -71,6 +71,9 @@ static const struct dpif_class *base_dpif_classes[] = {
#if defined(__linux__) || defined(_WIN32)
&dpif_netlink_class,
#endif
+#if HAVE_BPF /* XXX: Linux 4.9+ */
+ &dpif_bpf_class,
+#endif
&dpif_netdev_class,
};

--
2.7.4