[RFC PATCHv2 02/13] netdev: add ebpf support for netdev provider.


William Tu
 

From: Joe Stringer <joe@...>

To receive packets, an eBPF program has to be attached to a netdev
through tc ingress/egress, and an XDP program has to be attached to
a netdev's XDP hook point. This patch introduces two new netdev_class
functions for this purpose: set_filter and set_xdp. Two netdev types,
netdev-linux and netdev-vport, currently implement them.

Signed-off-by: William Tu <u9012063@...>
Co-authored-by: William Tu <u9012063@...>
Co-authored-by: Yifeng Sun <pkusunyifeng@...>
---
include/linux/pkt_cls.h | 21 +++
lib/dpif-netdev.c | 29 ++--
lib/netdev-bsd.c | 2 +
lib/netdev-dpdk.c | 2 +
lib/netdev-dummy.c | 2 +
lib/netdev-linux.c | 436 +++++++++++++++++++++++++++++++++++++++++++++++-
lib/netdev-linux.h | 2 +
lib/netdev-provider.h | 11 ++
lib/netdev-vport.c | 145 +++++++++++++++-
lib/netdev.c | 25 +++
lib/netdev.h | 4 +
11 files changed, 655 insertions(+), 24 deletions(-)

diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index f7bc7ea708d7..770af90a5c64 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -104,6 +104,27 @@ enum {
__TCA_BASIC_MAX
};

+/* BPF classifier */
+
+/* Flag bit for use in the TCA_BPF_FLAGS attribute. */
+#define TCA_BPF_FLAG_ACT_DIRECT (1 << 0)
+
+/* Netlink attribute types of the BPF classifier.  __TCA_BPF_MAX is a
+ * sentinel that must stay last; it is not an attribute type itself. */
+enum {
+ TCA_BPF_UNSPEC,
+ TCA_BPF_ACT,
+ TCA_BPF_POLICE,
+ TCA_BPF_CLASSID,
+ TCA_BPF_OPS_LEN,
+ TCA_BPF_OPS,
+ TCA_BPF_FD,
+ TCA_BPF_NAME,
+ TCA_BPF_FLAGS,
+ TCA_BPF_FLAGS_GEN,
+ TCA_BPF_TAG,
+ __TCA_BPF_MAX,
+};
+
+/* Highest-numbered TCA_BPF_* attribute type. */
+#define TCA_BPF_MAX (__TCA_BPF_MAX - 1)
+
/* Flower classifier */

enum {
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index ba62128c758c..baff020fe3d0 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -1505,12 +1505,6 @@ dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
ovs_mutex_unlock(&pmd->cond_mutex);
}

-static uint32_t
-hash_port_no(odp_port_t port_no)
-{
- return hash_int(odp_to_u32(port_no), 0);
-}
-
static int
port_create(const char *devname, const char *type,
odp_port_t port_no, struct dp_netdev_port **portp)
@@ -1525,6 +1519,7 @@ port_create(const char *devname, const char *type,

/* Open and validate network device. */
error = netdev_open(devname, type, &netdev);
+ VLOG_INFO("%s %s error %d", __func__, devname, error);
if (error) {
return error;
}
@@ -1578,7 +1573,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
return error;
}

- hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
+ hmap_insert(&dp->ports, &port->node, netdev_hash_port_no(port_no));
seq_change(dp->port_seq);

reconfigure_datapath(dp);
@@ -1596,6 +1591,8 @@ dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
odp_port_t port_no;
int error;

+ VLOG_INFO("%s", __func__);
+
ovs_mutex_lock(&dp->port_mutex);
dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
if (*port_nop != ODPP_NONE) {
@@ -1648,7 +1645,8 @@ dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
{
struct dp_netdev_port *port;

- HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
+ HMAP_FOR_EACH_WITH_HASH (port, node, netdev_hash_port_no(port_no),
+ &dp->ports) {
if (port->port_no == port_no) {
return port;
}
@@ -1808,7 +1806,7 @@ dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
odp_port_t in_port)
{
struct dpcls *cls;
- uint32_t hash = hash_port_no(in_port);
+ uint32_t hash = netdev_hash_port_no(in_port);
CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
if (cls->in_port == in_port) {
/* Port classifier exists already */
@@ -1824,7 +1822,7 @@ dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
OVS_REQUIRES(pmd->flow_mutex)
{
struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
- uint32_t hash = hash_port_no(in_port);
+ uint32_t hash = netdev_hash_port_no(in_port);

if (!cls) {
/* Create new classifier for in_port */
@@ -3311,7 +3309,7 @@ tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
{
struct tx_port *tx;

- HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
+ HMAP_FOR_EACH_IN_BUCKET (tx, node, netdev_hash_port_no(port_no), hmap) {
if (tx->port->port_no == port_no) {
return tx;
}
@@ -4034,13 +4032,13 @@ pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
- hash_port_no(tx_port_cached->port->port_no));
+ netdev_hash_port_no(tx_port_cached->port->port_no));
}

if (netdev_n_txq(tx_port->port->netdev)) {
tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
- hash_port_no(tx_port_cached->port->port_no));
+ netdev_hash_port_no(tx_port_cached->port->port_no));
}
}
}
@@ -4793,7 +4791,8 @@ dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
tx->flush_time = 0LL;
dp_packet_batch_init(&tx->output_pkts);

- hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
+ hmap_insert(&pmd->tx_ports, &tx->node,
+ netdev_hash_port_no(tx->port->port_no));
pmd->need_reload = true;
}

@@ -5965,7 +5964,7 @@ dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,

/* Reinsert with new port number. */
port->port_no = port_no;
- hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
+ hmap_insert(&dp->ports, &port->node, netdev_hash_port_no(port_no));
reconfigure_datapath(dp);

seq_change(dp->port_seq);
diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
index 05974c100895..1460ae2504c5 100644
--- a/lib/netdev-bsd.c
+++ b/lib/netdev-bsd.c
@@ -1516,6 +1516,8 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum netdev_flags off,
NULL, /* set_advertisement */ \
NULL, /* get_pt_mode */ \
NULL, /* set_policing */ \
+ NULL, /* set_filter */ \
+ NULL, /* set_xdp */ \
NULL, /* get_qos_type */ \
NULL, /* get_qos_capabilities */ \
NULL, /* get_qos */ \
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 52d8fe6b7ac2..20116c22137e 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -3854,6 +3854,8 @@ unlock:
NULL, /* get_pt_mode */ \
\
netdev_dpdk_set_policing, \
+ NULL, /* set_filter */ \
+ NULL, /* set_xdp */ \
netdev_dpdk_get_qos_types, \
NULL, /* get_qos_capabilities */ \
netdev_dpdk_get_qos, \
diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
index 4246af3b9c86..44c9458a9a22 100644
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -1427,6 +1427,8 @@ netdev_dummy_update_flags(struct netdev *netdev_,
NULL, /* get_pt_mode */ \
\
NULL, /* set_policing */ \
+ NULL, /* set_filter */ \
+ NULL, /* set_xdp */ \
NULL, /* get_qos_types */ \
NULL, /* get_qos_capabilities */ \
NULL, /* get_qos */ \
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 4e0473cf331f..121dd3bc738e 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -46,6 +46,9 @@
#include <string.h>
#include <unistd.h>

+#include <bpf/libbpf.h> /* linux/tools/bpf/libbpf.h */
+
+#include "bpf.h"
#include "coverage.h"
#include "dp-packet.h"
#include "dpif-netlink.h"
@@ -227,6 +230,9 @@ enum {
VALID_VPORT_STAT_ERROR = 1 << 5,
VALID_DRVINFO = 1 << 6,
VALID_FEATURES = 1 << 7,
+ VALID_INGRESS_FILTER = 1 << 8,
+ VALID_EGRESS_FILTER = 1 << 9,
+ VALID_XDP_FILTER = 1 << 10,
};

/* Traffic control. */
@@ -421,6 +427,7 @@ static const struct tc_ops tc_ops_sfq;
static const struct tc_ops tc_ops_default;
static const struct tc_ops tc_ops_noop;
static const struct tc_ops tc_ops_other;
+static const struct tc_ops tc_ops_clsact;

static const struct tc_ops *const tcs[] = {
&tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
@@ -431,6 +438,7 @@ static const struct tc_ops *const tcs[] = {
&tc_ops_noop, /* Non operating qos type. */
&tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
&tc_ops_other, /* Some other qdisc. */
+ &tc_ops_clsact, /* Classifier with nested action. */
NULL
};

@@ -442,8 +450,12 @@ static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
int type,
unsigned int flags,
struct ofpbuf *);
+static int clsact_install__(struct netdev *netdev_);
static int tc_add_policer(struct netdev *,
uint32_t kbits_rate, uint32_t kbits_burst);
+static int tc_add_filter(struct netdev *, int fd, uint32_t parent,
+ const char *name);
+static bool tc_is_clsact(const struct tc *tc);

static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
struct nlattr **options);
@@ -485,13 +497,19 @@ struct netdev_linux {
long long int carrier_resets;
uint32_t kbits_rate; /* Policing data. */
uint32_t kbits_burst;
+ uint32_t ingress_filter; /* BPF ingress filter fd. */
+ uint32_t egress_filter; /* BPF egress filter fd. */
+ uint32_t ingress_xdp_filter;/* XDP ingress filter fd. */
int vport_stats_error; /* Cached error code from vport_get_stats().
0 or an errno value. */
int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
int ether_addr_error; /* Cached error code from set/get etheraddr. */
int netdev_policing_error; /* Cached error code from set policing. */
+ int ingress_filter_error; /* Cached error code from set filter. */
+ int egress_filter_error; /* Cached error code from set filter. */
int get_features_error; /* Cached error code from ETHTOOL_GSET. */
int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
+ int ingress_xdp_error;

enum netdev_features current; /* Cached from ETHTOOL_GSET. */
enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
@@ -2159,8 +2177,14 @@ netdev_linux_set_policing(struct netdev *netdev_,
if (kbits_rate) {
error = tc_add_del_ingress_qdisc(ifindex, true);
if (error) {
- VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
- netdev_name, ovs_strerror(error));
+ const char *bpf_conflict = "";
+
+ if (error == EEXIST && (netdev->ingress_filter
+ || netdev->egress_filter)) {
+ bpf_conflict = " (conflicts with BPF)";
+ }
+ VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s%s",
+ netdev_name, ovs_strerror(error), bpf_conflict);
goto out;
}

@@ -2184,6 +2208,268 @@ out:
return error;
}

+/* Attaches BPF program 'prog' to 'netdev_' as a tc filter under a clsact
+ * qdisc.  If 'prog' is NULL no filter is added: any non-clsact qdisc is still
+ * removed and '*netdev_filter' is reset to 0.
+ *
+ * 'valid_bit' is the cache_valid flag for this filter direction,
+ * '*filter_error' caches the result of the last attempt, and '*netdev_filter'
+ * caches the fd of the program last installed.  When the cache is valid and
+ * either holds an error or already refers to 'prog', the cached result is
+ * returned without issuing any request.
+ *
+ * Returns 0 if successful, otherwise a positive errno value. */
+static int
+netdev_linux_set_filter__(struct netdev *netdev_, const struct bpf_prog *prog,
+                          unsigned int valid_bit, int *filter_error,
+                          uint32_t *netdev_filter)
+{
+    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+    const char *netdev_name = netdev_get_name(netdev_);
+    int error;
+
+    /* 'prog' may legitimately be NULL (see the checks below), so it must not
+     * be dereferenced unconditionally for logging. */
+    if (prog) {
+        VLOG_DBG("Setting %s filter %d on %s (handle %08"PRIx32")", prog->name,
+                 prog->fd, netdev_name, prog->handle);
+    } else {
+        VLOG_DBG("No filter program given for %s", netdev_name);
+    }
+
+    if (netdev->cache_valid & valid_bit) {
+        error = *filter_error;
+        if (error || (prog && prog->fd == *netdev_filter)) {
+            /* Assume that settings haven't changed since we last set them. */
+            goto out;
+        }
+        netdev->cache_valid &= ~valid_bit;
+    }
+
+    /* Remove non-clsact qdiscs. */
+    if (netdev->tc && !tc_is_clsact(netdev->tc)) {
+        error = tc_del_qdisc(netdev_);
+        if (error) {
+            VLOG_WARN_RL(&rl, "%s: removing qdisc failed: %s",
+                         netdev_name, ovs_strerror(error));
+            goto out;
+        }
+    }
+
+    if (prog) {
+        if (!netdev->tc || !tc_is_clsact(netdev->tc)) {
+            error = clsact_install__(netdev_);
+            /* An already existing clsact qdisc is fine to reuse. */
+            if (error && error != EEXIST) {
+                VLOG_WARN_RL(&rl, "%s: clsact qdisc setup failed: %s",
+                             netdev_name, ovs_strerror(error));
+                goto out;
+            }
+        }
+
+        error = tc_add_filter(netdev_, prog->fd, prog->handle, prog->name);
+        if (error) {
+            VLOG_WARN_RL(&rl, "%s: adding filter %s failed: %s",
+                         netdev_name, prog->name, ovs_strerror(error));
+            goto out;
+        }
+    }
+
+    *netdev_filter = prog ? prog->fd : 0;
+
+out:
+    /* Only success and ENODEV are cached; other failures are retried on the
+     * next call. */
+    if (!error || error == ENODEV) {
+        *filter_error = error;
+        netdev->cache_valid |= valid_bit;
+    }
+    return error;
+}
+
+/* Applies BPF program 'prog' to 'netdev_' while holding the netdev mutex.
+ *
+ * A NULL 'prog', or one whose handle is INGRESS_HANDLE, is tracked with the
+ * ingress cache fields; any other handle is tracked with the egress ones.
+ * Returns whatever netdev_linux_set_filter__() returns. */
+static int
+netdev_linux_set_filter(struct netdev *netdev_, const struct bpf_prog *prog)
+{
+    struct netdev_linux *dev = netdev_linux_cast(netdev_);
+    unsigned int valid_bit;
+    uint32_t *cached_fd;
+    int *cached_error;
+    int retval;
+
+    ovs_mutex_lock(&dev->mutex);
+    if (prog && prog->handle != INGRESS_HANDLE) {
+        valid_bit = VALID_EGRESS_FILTER;
+        cached_error = &dev->egress_filter_error;
+        cached_fd = &dev->egress_filter;
+    } else {
+        valid_bit = VALID_INGRESS_FILTER;
+        cached_error = &dev->ingress_filter_error;
+        cached_fd = &dev->ingress_filter;
+    }
+    retval = netdev_linux_set_filter__(netdev_, prog, valid_bit, cached_error,
+                                       cached_fd);
+    ovs_mutex_unlock(&dev->mutex);
+
+    return retval;
+}
+
+#ifndef SOL_NETLINK
+#define SOL_NETLINK 270
+#endif
+
+/* Extracted from libbpf.
+ *
+ * Sends an RTM_SETLINK request over a NETLINK_ROUTE socket that sets the
+ * IFLA_XDP_FD attribute (nested inside IFLA_XDP) of link 'ifindex' to 'fd'.
+ * If 'flags' is nonzero it is sent as an IFLA_XDP_FLAGS attribute as well.
+ *
+ * Returns 0 if the reply carried no error.  Otherwise returns a nonzero
+ * value: -errno for a failed socket call, the error code carried by an
+ * NLMSG_ERROR reply, or -1 for an unexpected socket address length or a reply
+ * whose pid or sequence number does not match the request. */
+int
+bpf_set_link_xdp_fd(int ifindex, int fd, uint32_t flags)
+{
+
+ struct sockaddr_nl sa;
+ int sock, seq = 0, len, ret = -1;
+ char buf[4096];
+ struct nlattr *nla, *nla_xdp;
+ struct {
+ struct nlmsghdr nh;
+ struct ifinfomsg ifinfo;
+ char attrbuf[64];
+ } req;
+ struct nlmsghdr *nh;
+ struct nlmsgerr *err;
+ socklen_t addrlen;
+ int one = 1;
+
+ memset(&sa, 0, sizeof(sa));
+ sa.nl_family = AF_NETLINK;
+
+ sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (sock < 0) {
+ return -errno;
+ }
+
+ /* Failing to enable NETLINK_EXT_ACK is not fatal; it is only logged. */
+ if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK,
+ &one, sizeof(one)) < 0) {
+ VLOG_WARN_RL(&rl, "Netlink error reporting not supported");
+ }
+
+ if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+ ret = -errno;
+ goto cleanup;
+ }
+
+ /* Read back the bound address; sa.nl_pid is compared against each reply
+ * below. */
+ addrlen = sizeof(sa);
+ if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
+ ret = -errno;
+ goto cleanup;
+ }
+
+ /* 'ret' is still -1 here. */
+ if (addrlen != sizeof(sa)) {
+ goto cleanup;
+ }
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_type = RTM_SETLINK;
+ req.nh.nlmsg_pid = 0;
+ req.nh.nlmsg_seq = ++seq;
+ req.ifinfo.ifi_family = AF_UNSPEC;
+ req.ifinfo.ifi_index = ifindex;
+
+ /* Start the nested attribute for XDP; it is built in place in the space
+ * following the aligned headers ('attrbuf'). */
+ nla = (struct nlattr *)(((char *)&req)
+ + NLMSG_ALIGN(req.nh.nlmsg_len));
+ nla->nla_type = NLA_F_NESTED | IFLA_XDP;
+ nla->nla_len = NLA_HDRLEN;
+
+ /* add XDP fd */
+ nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+ nla_xdp->nla_type = IFLA_XDP_FD;
+ nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
+ memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
+ nla->nla_len += nla_xdp->nla_len;
+
+ /* if user passed in any flags, add those too */
+ if (flags) {
+ nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+ nla_xdp->nla_type = IFLA_XDP_FLAGS;
+ nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
+ memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
+ nla->nla_len += nla_xdp->nla_len;
+ }
+
+ /* Account for the nested attribute in the total message length. */
+ req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
+
+ /* send */
+ if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ ret = -errno;
+ goto cleanup;
+ }
+
+ /* recv */
+ /* NOTE(review): only one recv() is issued, so a reply that does not fit in
+ * 'buf' or arrives in several reads is not handled -- confirm this is
+ * acceptable. */
+ len = recv(sock, buf, sizeof(buf), 0);
+ if (len < 0) {
+ ret = -errno;
+ goto cleanup;
+ }
+
+ /* Walk every netlink message in the reply; each must come from the
+ * expected pid and carry the sequence number of our request. */
+ for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
+ nh = NLMSG_NEXT(nh, len)) {
+ if (nh->nlmsg_pid != sa.nl_pid) {
+ ret = -1;
+ goto cleanup;
+ }
+ if (nh->nlmsg_seq != seq) {
+ ret = -1;
+ goto cleanup;
+ }
+ switch (nh->nlmsg_type) {
+ case NLMSG_ERROR:
+ err = (struct nlmsgerr *)NLMSG_DATA(nh);
+ /* An error code of 0 is not a failure; keep scanning. */
+ if (!err->error)
+ continue;
+ ret = err->error;
+ /* nla_dump_errormsg(nh); */
+ goto cleanup;
+ case NLMSG_DONE:
+ break;
+ default:
+ break;
+ }
+ }
+
+ ret = 0;
+
+cleanup:
+ close(sock);
+ return ret;
+}
+
+/* Attaches XDP program 'prog' to 'netdev_' using XDP_FLAGS_SKB_MODE.  If
+ * 'prog' is NULL, a negative fd is passed down, which asks the kernel to
+ * detach the XDP program currently installed.
+ *
+ * 'valid_bit' is the cache_valid flag for XDP, '*filter_error' caches the
+ * result of the last attempt, and '*netdev_filter' caches the fd of the
+ * program last installed.  When the cache is valid and either holds an error
+ * or already refers to 'prog', the cached result is returned without issuing
+ * a request.
+ *
+ * Returns 0 if successful, otherwise a positive errno value.
+ *
+ * NOTE(review): this uses the cached netdev->ifindex as is; confirm that it is
+ * always populated before this function can be called. */
+static int
+netdev_linux_set_xdp__(struct netdev *netdev_, const struct bpf_prog *prog,
+                       unsigned int valid_bit, int *filter_error,
+                       uint32_t *netdev_filter)
+{
+    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+    const char *netdev_name = netdev_get_name(netdev_);
+    /* 'prog' may be NULL (the cache check below allows for it), so never
+     * dereference it unconditionally. */
+    const char *prog_name = prog ? prog->name : "(none)";
+    int fd = prog ? prog->fd : -1;
+    int ifindex = netdev->ifindex;
+    int error;
+
+    VLOG_DBG("Setting %s XDP filter %d on %s (ifindex %d)", prog_name,
+             fd, netdev_name, ifindex);
+
+    if (netdev->cache_valid & valid_bit) {
+        error = *filter_error;
+        if (error || (prog && prog->fd == *netdev_filter)) {
+            /* Assume that settings haven't changed since we last set them. */
+            goto out;
+        }
+        netdev->cache_valid &= ~valid_bit;
+    }
+
+    error = bpf_set_link_xdp_fd(ifindex, fd, XDP_FLAGS_SKB_MODE);
+    if (error < 0) {
+        /* bpf_set_link_xdp_fd() reports failure as a negative value, while
+         * ovs_strerror(), the ENODEV check below and our callers all expect a
+         * positive errno value. */
+        error = -error;
+        VLOG_WARN_RL(&rl, "%s: adding XDP filter %s failed: %s",
+                     netdev_name, prog_name, ovs_strerror(error));
+        goto out;
+    }
+
+    /* Remember what is installed so that the cache check above can match. */
+    *netdev_filter = prog ? prog->fd : 0;
+
+out:
+    /* Only success and ENODEV are cached; other failures are retried on the
+     * next call. */
+    if (!error || error == ENODEV) {
+        *filter_error = error;
+        netdev->cache_valid |= valid_bit;
+    }
+    return error;
+}
+
+/* Applies XDP program 'prog' to 'netdev_' while holding the netdev mutex,
+ * using the ingress XDP cache fields of the device.  Returns whatever
+ * netdev_linux_set_xdp__() returns. */
+static int
+netdev_linux_set_xdp(struct netdev *netdev_, const struct bpf_prog *prog)
+{
+    struct netdev_linux *dev = netdev_linux_cast(netdev_);
+    int *cached_error = &dev->ingress_xdp_error;
+    uint32_t *cached_fd = &dev->ingress_xdp_filter;
+    int retval;
+
+    ovs_mutex_lock(&dev->mutex);
+    retval = netdev_linux_set_xdp__(netdev_, prog, VALID_XDP_FILTER,
+                                    cached_error, cached_fd);
+    ovs_mutex_unlock(&dev->mutex);
+
+    return retval;
+}
+
static int
netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
struct sset *types)
@@ -2879,6 +3165,8 @@ netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
NULL, /* get_pt_mode */ \
\
netdev_linux_set_policing, \
+ netdev_linux_set_filter, \
+ netdev_linux_set_xdp, \
netdev_linux_get_qos_types, \
netdev_linux_get_qos_capabilities, \
netdev_linux_get_qos, \
@@ -4671,6 +4959,74 @@ static const struct tc_ops tc_ops_other = {
NULL /* class_dump_stats */
};

+/* "linux-clsact" traffic control class. */
+
+/* Asks the kernel to create a "clsact" qdisc on 'netdev', with handle ffff:0
+ * and parent TC_H_INGRESS.  The request is exclusive (NLM_F_EXCL).
+ *
+ * Returns 0 if successful, ENODEV if the request could not be built for
+ * 'netdev', otherwise the result of the netlink transaction. */
+static int
+clsact_setup_qdisc(struct netdev *netdev)
+{
+    const int nl_flags = NLM_F_EXCL | NLM_F_CREATE;
+    struct ofpbuf msg;
+    struct tcmsg *tcm;
+    int retval;
+
+    tcm = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC, nl_flags, &msg);
+    if (!tcm) {
+        return ENODEV;
+    }
+
+    tcm->tcm_parent = TC_H_INGRESS;
+    tcm->tcm_handle = tc_make_handle(0xFFFF, 0);
+
+    nl_msg_put_string(&msg, TCA_KIND, "clsact");
+    nl_msg_put_unspec(&msg, TCA_OPTIONS, NULL, 0);
+
+    retval = tc_transact(&msg, NULL);
+    return retval;
+}
+
+/* Creates the clsact qdisc on 'netdev_' and, on success, points the device's
+ * 'tc' at a shared, constant clsact tc object.
+ *
+ * Returns 0 if successful, otherwise the error from clsact_setup_qdisc(), in
+ * which case 'netdev->tc' is left untouched. */
+static int
+clsact_install__(struct netdev *netdev_)
+{
+    static const struct tc clsact_tc = TC_INITIALIZER(&clsact_tc,
+                                                      &tc_ops_clsact);
+    struct netdev_linux *dev = netdev_linux_cast(netdev_);
+    int error = clsact_setup_qdisc(netdev_);
+
+    if (!error) {
+        /* Only a tc class implementation may write to a tc, and this class
+         * never does, so sharing one const object between devices is fine. */
+        dev->tc = CONST_CAST(struct tc *, &clsact_tc);
+    }
+
+    return error;
+}
+
+/* tc_install callback of the clsact class: installs the clsact qdisc on
+ * 'netdev'.  'details' is ignored. */
+static int
+clsact_tc_install(struct netdev *netdev,
+                  const struct smap *details OVS_UNUSED)
+{
+    int error = clsact_install__(netdev);
+
+    return error;
+}
+
+/* tc_load callback of the clsact class.  'nlmsg' is ignored; this simply
+ * runs the same installation as clsact_tc_install().
+ *
+ * NOTE(review): clsact_install__() sends an exclusive (NLM_F_EXCL) create
+ * request, so if this callback runs for a qdisc that already exists it
+ * presumably fails with EEXIST and leaves 'netdev->tc' unset -- confirm. */
+static int
+clsact_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
+{
+    int error = clsact_install__(netdev);
+
+    return error;
+}
+
+/* Operations table of the clsact class.  It has no queues and provides only
+ * the install and load callbacks; every other callback is NULL. */
+static const struct tc_ops tc_ops_clsact = {
+ "clsact", /* linux_name */
+ "linux-clsact", /* ovs_name */
+ 0, /* n_queues */
+ clsact_tc_install,
+ clsact_tc_load,
+ NULL, /* tc_destroy */
+ NULL, /* qdisc_get */
+ NULL, /* qdisc_set */
+ NULL, /* class_get */
+ NULL, /* class_set */
+ NULL, /* class_delete */
+ NULL, /* class_get_stats */
+ NULL /* class_dump_stats */
+};
+
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
@@ -4775,6 +5131,49 @@ tc_add_policer(struct netdev *netdev,
return 0;
}

+/* Installs the BPF program referred to by 'fd' as a "bpf" tc filter named
+ * 'name' on 'netdev', under the qdisc/class 'parent'.  The filter is created
+ * exclusively (NLM_F_EXCL), with handle 0:1, preference 0 and protocol
+ * ETH_P_ALL, and with the TCA_BPF_FLAG_ACT_DIRECT flag set.
+ *
+ * This function is equivalent to running:
+ *     /sbin/tc filter add dev <devname> <parent> bpf da object-pinned <path>
+ *
+ * The configuration and stats may be seen with the following command:
+ *     /sbin/tc -s filter show dev <devname> <parent>
+ *
+ * Returns 0 if successful, otherwise a positive errno value.
+ */
+static int
+tc_add_filter(struct netdev *netdev, int fd, uint32_t parent, const char *name)
+{
+    const int nl_flags = NLM_F_EXCL | NLM_F_CREATE;
+    struct ofpbuf msg;
+    struct tcmsg *tcm;
+    size_t nest_start;
+
+    tcm = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER, nl_flags,
+                                       &msg);
+    if (!tcm) {
+        return ENODEV;
+    }
+
+    tcm->tcm_parent = parent;
+    tcm->tcm_handle = tc_make_handle(0, 0x1);
+    tcm->tcm_info = tc_make_handle(0, /* preference */
+                                   (OVS_FORCE uint16_t) htons(ETH_P_ALL));
+
+    nl_msg_put_string(&msg, TCA_KIND, "bpf");
+
+    nest_start = nl_msg_start_nested(&msg, TCA_OPTIONS);
+    nl_msg_put_u32(&msg, TCA_BPF_FLAGS, TCA_BPF_FLAG_ACT_DIRECT);
+    nl_msg_put_u32(&msg, TCA_BPF_FD, fd);
+    nl_msg_put_string(&msg, TCA_BPF_NAME, name);
+    nl_msg_end_nested(&msg, nest_start);
+
+    return tc_transact(&msg, NULL);
+}
+
static void
read_psched(void)
{
@@ -5060,21 +5459,21 @@ tc_delete_class(const struct netdev *netdev, unsigned int handle)
return error;
}

-/* Equivalent to "tc qdisc del dev <name> root". */
+/* Equivalent to "tc qdisc del dev <name> handle <handle> <parent>". */
static int
-tc_del_qdisc(struct netdev *netdev_)
+tc_del_qdisc__(struct netdev_linux *netdev, uint32_t parent, uint32_t handle)
{
- struct netdev_linux *netdev = netdev_linux_cast(netdev_);
struct ofpbuf request;
struct tcmsg *tcmsg;
int error;

- tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
+ tcmsg = netdev_linux_tc_make_request(&netdev->up, RTM_DELQDISC, 0,
+ &request);
if (!tcmsg) {
return ENODEV;
}
- tcmsg->tcm_handle = tc_make_handle(1, 0);
- tcmsg->tcm_parent = TC_H_ROOT;
+ tcmsg->tcm_handle = handle;
+ tcmsg->tcm_parent = parent;

error = tc_transact(&request, NULL);
if (error == EINVAL) {
@@ -5092,6 +5491,27 @@ tc_del_qdisc(struct netdev *netdev_)
}

static bool
+tc_is_clsact(const struct tc *tc)
+{
+    /* True only for a non-NULL 'tc' whose class has the Linux name
+     * "clsact". */
+    return tc
+           && tc->ops->linux_name
+           && strcmp(tc->ops->linux_name, "clsact") == 0;
+}
+
+/* Deletes the qdisc that is tracked for 'netdev_'.
+ *
+ * If the device currently uses the clsact class, this removes the qdisc
+ * attached at TC_H_INGRESS; otherwise it is equivalent to
+ * "tc qdisc del dev <name> root" for the qdisc with handle 1:0.
+ *
+ * Returns whatever tc_del_qdisc__() returns. */
+static int
+tc_del_qdisc(struct netdev *netdev_)
+{
+    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+
+    /* tc_is_clsact() already copes with a NULL tc. */
+    if (tc_is_clsact(netdev->tc)) {
+        /* clsact_setup_qdisc() creates the qdisc with handle ffff:0, so it has
+         * to be deleted by that same handle.  tc_make_handle(TC_H_INGRESS, 0)
+         * would name a different handle, because TC_H_INGRESS is a complete
+         * 32-bit handle rather than a 16-bit major number. */
+        return tc_del_qdisc__(netdev, TC_H_INGRESS,
+                              tc_make_handle(0xFFFF, 0));
+    }
+    return tc_del_qdisc__(netdev, TC_H_ROOT, tc_make_handle(1, 0));
+}
+
+static bool
getqdisc_is_safe(void)
{
static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
diff --git a/lib/netdev-linux.h b/lib/netdev-linux.h
index 880f86402a1e..8257d4c695f9 100644
--- a/lib/netdev-linux.h
+++ b/lib/netdev-linux.h
@@ -29,6 +29,8 @@ int netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
const char *flag_name, bool enable);
int linux_get_ifindex(const char *netdev_name);

+int bpf_set_link_xdp_fd(int ifindex, int fd, uint32_t flags);
+
#define LINUX_FLOW_OFFLOAD_API \
netdev_tc_flow_flush, \
netdev_tc_flow_dump_create, \
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index 25bd671c1382..3e53a5b76272 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -32,6 +32,7 @@
extern "C" {
#endif

+struct bpf_prog;
struct netdev_tnl_build_header_params;
#define NETDEV_NUMA_UNSPEC OVS_NUMA_UNSPEC

@@ -505,6 +506,16 @@ struct netdev_class {
int (*set_policing)(struct netdev *netdev, unsigned int kbits_rate,
unsigned int kbits_burst);

+ /* Attempts to attach a traffic filter in the form of an (e)BPF program.
+ *
+ * This function may be set to null if filters are not supported. */
+ int (*set_filter)(struct netdev *netdev, const struct bpf_prog *);
+
+ /* Attempts to attach an XDP eBPF program.
+ *
+ * This function may be set to null if XDP is not supported. */
+ int (*set_xdp)(struct netdev *netdev, const struct bpf_prog *);
+
/* Adds to 'types' all of the forms of QoS supported by 'netdev', or leaves
* it empty if 'netdev' does not support QoS. Any names added to 'types'
* should be documented as valid for the "type" column in the "QoS" table
diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
index 52aa12d79933..4341c89894a3 100644
--- a/lib/netdev-vport.c
+++ b/lib/netdev-vport.c
@@ -22,12 +22,14 @@
#include <errno.h>
#include <fcntl.h>
#include <sys/socket.h>
+#include <linux/rtnetlink.h>
#include <net/if.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <sys/ioctl.h>

+#include "bpf.h"
#include "byte-order.h"
#include "daemon.h"
#include "dirs.h"
@@ -43,6 +45,7 @@
#include "route-table.h"
#include "smap.h"
#include "socket-util.h"
+#include "tc.h"
#include "unaligned.h"
#include "unixctl.h"
#include "openvswitch/vlog.h"
@@ -72,6 +75,10 @@ struct vport_class {
struct netdev_class netdev_class;
};

+/* This is set pretty low because we probably won't learn anything from the
+ * additional log messages. */
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+
bool
netdev_vport_is_vport_class(const struct netdev_class *class)
{
@@ -866,6 +873,140 @@ netdev_vport_get_ifindex(const struct netdev *netdev_)
return linux_get_ifindex(name);
}

+/* Asks the kernel to create a "clsact" qdisc, with handle ffff:0 and parent
+ * TC_H_INGRESS, on the device whose ifindex netdev_vport_get_ifindex()
+ * reports for 'netdev'.  The request is exclusive (NLM_F_EXCL).
+ *
+ * Returns 0 if successful, ENODEV if the request could not be built,
+ * otherwise the result of the netlink transaction.
+ *
+ * NOTE(review): the ifindex is used without checking it for failure --
+ * confirm what netdev_vport_get_ifindex() returns on error. */
+static int
+clsact_setup_qdisc(struct netdev *netdev)
+{
+    const int nl_flags = NLM_F_EXCL | NLM_F_CREATE;
+    int ifindex = netdev_vport_get_ifindex(netdev);
+    struct ofpbuf msg;
+    struct tcmsg *tcm;
+
+    tcm = tc_make_request(ifindex, RTM_NEWQDISC, nl_flags, &msg);
+    if (!tcm) {
+        return ENODEV;
+    }
+
+    tcm->tcm_parent = TC_H_INGRESS;
+    tcm->tcm_handle = tc_make_handle(0xFFFF, 0);
+
+    nl_msg_put_string(&msg, TCA_KIND, "clsact");
+    nl_msg_put_unspec(&msg, TCA_OPTIONS, NULL, 0);
+
+    return tc_transact(&msg, NULL);
+}
+
+/* Installs the BPF program referred to by 'fd' as a "bpf" tc filter named
+ * 'name', under 'parent', on the device whose ifindex
+ * netdev_vport_get_ifindex() reports for 'netdev'.  The filter is created
+ * exclusively (NLM_F_EXCL), with handle 0:1, preference 0 and protocol
+ * ETH_P_ALL, and with the TCA_BPF_FLAG_ACT_DIRECT flag set.
+ *
+ * Returns 0 if successful, ENODEV if the request could not be built,
+ * otherwise the result of the netlink transaction. */
+static int
+tc_add_filter(struct netdev *netdev, int fd, uint32_t parent, const char *name)
+{
+    const int nl_flags = NLM_F_EXCL | NLM_F_CREATE;
+    int ifindex = netdev_vport_get_ifindex(netdev);
+    struct ofpbuf msg;
+    struct tcmsg *tcm;
+    size_t nest_start;
+
+    tcm = tc_make_request(ifindex, RTM_NEWTFILTER, nl_flags, &msg);
+    if (!tcm) {
+        return ENODEV;
+    }
+
+    tcm->tcm_parent = parent;
+    tcm->tcm_handle = tc_make_handle(0, 0x1);
+#define ETH_P_ALL 0x0003
+    tcm->tcm_info = tc_make_handle(0, /* preference */
+                                   (OVS_FORCE uint16_t) htons(ETH_P_ALL));
+
+    nl_msg_put_string(&msg, TCA_KIND, "bpf");
+
+    nest_start = nl_msg_start_nested(&msg, TCA_OPTIONS);
+    nl_msg_put_u32(&msg, TCA_BPF_FLAGS, TCA_BPF_FLAG_ACT_DIRECT);
+    nl_msg_put_u32(&msg, TCA_BPF_FD, fd);
+    nl_msg_put_string(&msg, TCA_BPF_NAME, name);
+    nl_msg_end_nested(&msg, nest_start);
+
+    return tc_transact(&msg, NULL);
+}
+
+/* Attaches BPF program 'prog' to the vport 'netdev_' as a tc filter, creating
+ * the clsact qdisc first (an already existing one is tolerated).  A NULL
+ * 'prog' is a no-op that returns 0.
+ *
+ * 'valid_bit', 'filter_error' and 'netdev_filter' are unused; nothing is
+ * cached for vports.
+ *
+ * Returns 0 if successful, otherwise a positive errno value. */
+static int
+netdev_vport_set_filter__(struct netdev *netdev_, const struct bpf_prog *prog,
+                          unsigned int OVS_UNUSED valid_bit,
+                          int OVS_UNUSED *filter_error,
+                          uint32_t OVS_UNUSED *netdev_filter)
+{
+    struct netdev_vport OVS_UNUSED *netdev = netdev_vport_cast(netdev_);
+    const char *netdev_name = netdev_get_name(netdev_);
+    int error;
+
+    if (!prog) {
+        return 0;
+    }
+
+    VLOG_DBG("Setting %s filter %d on %s (handle %08"PRIx32")", prog->name,
+             prog->fd, netdev_name, prog->handle);
+
+    error = clsact_setup_qdisc(netdev_);
+    if (error && error != EEXIST) {
+        VLOG_WARN("%s: clsact qdisc setup failed: %s",
+                  netdev_name, ovs_strerror(error));
+    } else {
+        /* The qdisc is in place (new or pre-existing); add the filter. */
+        error = tc_add_filter(netdev_, prog->fd, prog->handle, prog->name);
+        if (error) {
+            VLOG_WARN_RL(&rl, "%s: adding filter %s failed: %s",
+                         netdev_name, prog->name, ovs_strerror(error));
+        }
+    }
+
+    VLOG_INFO("%s %d", __func__, error);
+    return error;
+}
+
+/* Applies BPF program 'prog' to the vport 'netdev_' while holding the netdev
+ * mutex.  Only a NULL 'prog' or one whose handle is INGRESS_HANDLE is acted
+ * upon; for any other handle nothing is done and 0 is returned.
+ *
+ * NOTE(review): programs with a non-ingress handle are silently reported as
+ * successful -- confirm that this is intended. */
+static int
+netdev_vport_set_filter(struct netdev *netdev_, const struct bpf_prog *prog)
+{
+    struct netdev_vport *vport = netdev_vport_cast(netdev_);
+    int retval = 0;
+
+    ovs_mutex_lock(&vport->mutex);
+    if (prog && prog->handle != INGRESS_HANDLE) {
+        /* Not an ingress program: leave 'retval' at 0. */
+    } else {
+        retval = netdev_vport_set_filter__(netdev_, prog, 0, NULL, NULL);
+    }
+    ovs_mutex_unlock(&vport->mutex);
+
+    VLOG_INFO("%s %d", __func__, retval);
+
+    return retval;
+}
+
+int bpf_set_link_xdp_fd(int ifindex, int fd, uint32_t flags);
+
+/* Attaches XDP program 'prog' to the vport 'netdev_' using
+ * XDP_FLAGS_SKB_MODE.  If 'prog' is NULL, a negative fd is passed down, which
+ * asks the kernel to detach the XDP program currently installed.
+ *
+ * Returns 0 if successful, otherwise a positive errno value: ENODEV if no
+ * usable ifindex is available, else the negated failure value of
+ * bpf_set_link_xdp_fd(). */
+static int
+netdev_vport_set_xdp(struct netdev *netdev_, const struct bpf_prog *prog)
+{
+    struct netdev_vport *netdev = netdev_vport_cast(netdev_);
+    /* 'prog' must not be dereferenced when it is NULL. */
+    int fd = prog ? prog->fd : -1;
+    int error = 0;
+    int ifindex;
+
+    ovs_mutex_lock(&netdev->mutex);
+    ifindex = netdev_vport_get_ifindex(netdev_);
+    if (ifindex <= 0) {
+        /* No valid interface index, so there is nothing to attach to. */
+        error = ENODEV;
+    } else {
+        error = bpf_set_link_xdp_fd(ifindex, fd, XDP_FLAGS_SKB_MODE);
+        if (error < 0) {
+            /* bpf_set_link_xdp_fd() reports failure as a negative value;
+             * netdev callers expect a positive errno value. */
+            error = -error;
+        }
+    }
+    ovs_mutex_unlock(&netdev->mutex);
+
+    VLOG_INFO("%s %d", __func__, error);
+
+    return error;
+}
+
#define NETDEV_VPORT_GET_IFINDEX netdev_vport_get_ifindex
#define NETDEV_FLOW_OFFLOAD_API LINUX_FLOW_OFFLOAD_API
#else /* !__linux__ */
@@ -914,6 +1055,8 @@ netdev_vport_get_ifindex(const struct netdev *netdev_)
get_pt_mode, \
\
NULL, /* set_policing */ \
+ netdev_vport_set_filter, /* set_filter */ \
+ netdev_vport_set_xdp, /* set_xdp */ \
NULL, /* get_qos_types */ \
NULL, /* get_qos_capabilities */ \
NULL, /* get_qos */ \
@@ -972,7 +1115,7 @@ netdev_vport_tunnel_register(void)
TUNNEL_CLASS("gre", "gre_sys", netdev_gre_build_header,
netdev_gre_push_header,
netdev_gre_pop_header,
- NULL),
+ NETDEV_VPORT_GET_IFINDEX),
TUNNEL_CLASS("vxlan", "vxlan_sys", netdev_vxlan_build_header,
netdev_tnl_push_udp_header,
netdev_vxlan_pop_header,
diff --git a/lib/netdev.c b/lib/netdev.c
index be05dc64024a..c44a1a683b92 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -759,6 +759,13 @@ netdev_get_pt_mode(const struct netdev *netdev)
: NETDEV_PT_LEGACY_L2);
}

+/* Hashes datapath port number 'port_no' into a 32-bit value, using a basis
+ * of 0. */
+uint32_t
+netdev_hash_port_no(odp_port_t port_no)
+{
+    uint32_t port = odp_to_u32(port_no);
+
+    return hash_int(port, 0);
+}
+
/* Sends 'batch' on 'netdev'. Returns 0 if successful (for every packet),
* otherwise a positive errno value. Returns EAGAIN without blocking if
* at least one the packets cannot be queued immediately. Returns EMSGSIZE
@@ -1449,6 +1456,24 @@ netdev_set_policing(struct netdev *netdev, uint32_t kbits_rate,
: EOPNOTSUPP);
}

+/* Asks 'netdev' to apply the (e)BPF filter 'prog' through its class's
+ * set_filter callback.  Returns the callback's result, or EOPNOTSUPP if the
+ * class does not provide one. */
+int
+netdev_set_filter(struct netdev *netdev, struct bpf_prog *prog)
+{
+    const struct netdev_class *class = netdev->netdev_class;
+
+    if (!class->set_filter) {
+        return EOPNOTSUPP;
+    }
+    return class->set_filter(netdev, prog);
+}
+
+/* Asks 'netdev' to attach the XDP program 'prog' through its class's set_xdp
+ * callback.  Returns the callback's result, or EOPNOTSUPP if the class does
+ * not provide one. */
+int
+netdev_set_xdp(struct netdev *netdev, struct bpf_prog *prog)
+{
+    const struct netdev_class *class = netdev->netdev_class;
+
+    if (!class->set_xdp) {
+        return EOPNOTSUPP;
+    }
+    return class->set_xdp(netdev, prog);
+}
+
/* Adds to 'types' all of the forms of QoS supported by 'netdev', or leaves it
* empty if 'netdev' does not support QoS. Any names added to 'types' should
* be documented as valid for the "type" column in the "QoS" table in
diff --git a/lib/netdev.h b/lib/netdev.h
index ff1b604b24e2..3388504d85c9 100644
--- a/lib/netdev.h
+++ b/lib/netdev.h
@@ -59,6 +59,7 @@ extern "C" {
* netdev and access each of those from a different thread.)
*/

+struct bpf_prog;
struct dp_packet_batch;
struct dp_packet;
struct netdev_class;
@@ -167,6 +168,7 @@ bool netdev_mtu_is_user_config(struct netdev *);
int netdev_get_ifindex(const struct netdev *);
int netdev_set_tx_multiq(struct netdev *, unsigned int n_txq);
enum netdev_pt_mode netdev_get_pt_mode(const struct netdev *);
+uint32_t netdev_hash_port_no(odp_port_t port_no);

/* Packet reception. */
int netdev_rxq_open(struct netdev *, struct netdev_rxq **, int id);
@@ -316,6 +318,8 @@ struct netdev_queue_stats {

int netdev_set_policing(struct netdev *, uint32_t kbits_rate,
uint32_t kbits_burst);
+int netdev_set_filter(struct netdev *netdev, struct bpf_prog *prog);
+int netdev_set_xdp(struct netdev *netdev, struct bpf_prog *prog);

int netdev_get_qos_types(const struct netdev *, struct sset *types);
int netdev_get_qos_capabilities(const struct netdev *,
--
2.7.4