Add support for the BPF_PROG_TYPE_PHYS_DEV hook in the mlx5e driver.
This is relevant for CX4Lx, with striding RQ and non-fragmented WQEs.
For fast RX drop on a single RX core we get 32M PPS, compared to
34M PPS for early drop (without a bpf program).
Signed-off-by: Rana Shahout <ranas@...>
---
Submitting as RFC, since this patch is based on v1 of the XDP
series patch set from Brenden Blanco.
We are now working on rebasing this patch on the latest XDP support
patch set and on extending the mlx5e driver to support the TX forward case.
We will submit the final patch once Brenden's patch set gets accepted.
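For reference, the 32M PPS drop figure above was measured with a bpf
program that unconditionally returns a drop verdict. A minimal sketch of
such a program is shown below; it assumes the BPF_PROG_TYPE_PHYS_DEV uapi
from Brenden's v1 set (struct bpf_phys_dev_md, BPF_PHYS_DEV_DROP), while
the SEC("phys_dev") section name and the bpf_helpers.h header follow the
samples/bpf convention and are assumptions here, not part of this patch.

/* drop_all_kern.c - sketch: unconditionally drop every frame at driver RX */
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

SEC("phys_dev")
int bpf_prog_drop_all(struct bpf_phys_dev_md *ctx)
{
	/* ctx->len is readable here, but a pure drop test ignores it */
	return BPF_PHYS_DEV_DROP;
}

char _license[] SEC("license") = "GPL";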
drivers/net/ethernet/mellanox/mlx5/core/en.h | 11 +++--
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 57 +++++++++++++++++++++++
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 54 ++++++++++++++++++---
include/uapi/linux/bpf.h | 13 ++++++
4 files changed, 125 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 4cbd452..b073c68 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -242,7 +242,8 @@ struct mlx5e_cq {
struct mlx5e_rq;
typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq *rq,
- struct mlx5_cqe64 *cqe);
+ struct mlx5_cqe64 *cqe,
+ struct bpf_prog *prog);
typedef int (*mlx5e_fp_alloc_wqe)(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe,
u16 ix);
@@ -609,6 +610,7 @@ struct mlx5e_priv {
/* priv data path fields - start */
struct mlx5e_sq **txq_to_sq_map;
int channeltc_to_txq_map[MLX5E_MAX_NUM_CHANNELS][MLX5E_MAX_NUM_TC];
+ struct bpf_prog *prog;
/* priv data path fields - end */
unsigned long state;
@@ -692,8 +694,10 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget);
void mlx5e_free_tx_descs(struct mlx5e_sq *sq);
void mlx5e_free_rx_descs(struct mlx5e_rq *rq);
-void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
-void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
+void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+ struct bpf_prog *prog);
+void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+ struct bpf_prog *prog);
bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq);
int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix);
int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix);
@@ -767,6 +771,7 @@ int mlx5e_get_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params,
u8 cq_period_mode);
+int mlx5e_call_bpf(struct bpf_prog *prog, void *data, unsigned int length);
static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
struct mlx5_wqe_ctrl_seg *ctrl, int bf_sz)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 611ab55..10a5613 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -34,6 +34,7 @@
#include <net/pkt_cls.h>
#include <linux/mlx5/fs.h>
#include <net/vxlan.h>
+#include <linux/bpf.h>
#include "en.h"
#include "en_tc.h"
#include "eswitch.h"
@@ -1883,6 +1884,9 @@ int mlx5e_close_locked(struct net_device *netdev)
mlx5e_redirect_rqts(priv);
mlx5e_close_channels(priv);
+ if (priv->prog)
+ bpf_prog_put(priv->prog);
+
return 0;
}
@@ -2786,6 +2790,55 @@ static void mlx5e_tx_timeout(struct net_device *dev)
schedule_work(&priv->tx_timeout_work);
}
+static DEFINE_PER_CPU(struct sk_buff, percpu_bpf_phys_dev_md);
+
+static void build_bpf_phys_dev_md(struct sk_buff *skb, void *data,
+ unsigned int length)
+{
+ /* data_len is intentionally not set here so that skb_is_nonlinear()
+ * returns false
+ */
+
+ skb->len = length;
+ skb->head = data;
+ skb->data = data;
+}
+
+int mlx5e_call_bpf(struct bpf_prog *prog, void *data, unsigned int length)
+{
+ struct sk_buff *skb = this_cpu_ptr(&percpu_bpf_phys_dev_md);
+ int ret;
+
+ build_bpf_phys_dev_md(skb, data, length);
+
+ rcu_read_lock();
+ ret = BPF_PROG_RUN(prog, (void *)skb);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int mlx5e_bpf_set(struct net_device *netdev, struct bpf_prog *prog)
+{
+ struct mlx5e_priv *priv = netdev_priv(netdev);
+ struct bpf_prog *old_prog;
+
+ old_prog = xchg(&priv->prog, prog);
+ if (old_prog) {
+ synchronize_net();
+ bpf_prog_put(old_prog);
+ }
+
+ return 0;
+}
+
+static bool mlx5e_bpf_get(struct net_device *netdev)
+{
+ struct mlx5e_priv *priv = netdev_priv(netdev);
+
+ return !!priv->prog;
+}
+
static const struct net_device_ops mlx5e_netdev_ops_basic = {
.ndo_open = mlx5e_open,
.ndo_stop = mlx5e_close,
@@ -2805,6 +2858,8 @@ static const struct net_device_ops mlx5e_netdev_ops_basic = {
.ndo_rx_flow_steer = mlx5e_rx_flow_steer,
#endif
.ndo_tx_timeout = mlx5e_tx_timeout,
+ .ndo_bpf_set = mlx5e_bpf_set,
+ .ndo_bpf_get = mlx5e_bpf_get,
};
static const struct net_device_ops mlx5e_netdev_ops_sriov = {
@@ -2836,6 +2891,8 @@ static const struct net_device_ops mlx5e_netdev_ops_sriov = {
.ndo_set_vf_link_state = mlx5e_set_vf_link_state,
.ndo_get_vf_stats = mlx5e_get_vf_stats,
.ndo_tx_timeout = mlx5e_tx_timeout,
+ .ndo_bpf_set = mlx5e_bpf_set,
+ .ndo_bpf_get = mlx5e_bpf_get,
};
static int mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 9f2a16a..b297675 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -131,7 +131,7 @@ static inline u32 mlx5e_decompress_cqes_cont(struct mlx5e_rq *rq,
mlx5e_read_mini_arr_slot(cq, cqcc);
mlx5e_decompress_cqe_no_hash(rq, cq, cqcc);
- rq->handle_rx_cqe(rq, &cq->title);
+ rq->handle_rx_cqe(rq, &cq->title, rq->priv->prog);
}
mlx5e_cqes_update_owner(cq, cq->wq.cc, cqcc - cq->wq.cc);
cq->wq.cc = cqcc;
@@ -148,7 +148,7 @@ static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq,
mlx5e_read_title_slot(rq, cq, cq->wq.cc);
mlx5e_read_mini_arr_slot(cq, cq->wq.cc + 1);
mlx5e_decompress_cqe(rq, cq, cq->wq.cc);
- rq->handle_rx_cqe(rq, &cq->title);
+ rq->handle_rx_cqe(rq, &cq->title, rq->priv->prog);
cq->mini_arr_idx++;
return mlx5e_decompress_cqes_cont(rq, cq, 1, budget_rem) - 1;
@@ -786,13 +786,13 @@ static inline void mlx5e_complete_rx_cqe(struct mlx5e_rq *rq,
u32 cqe_bcnt,
struct sk_buff *skb)
{
- rq->stats.packets++;
rq->stats.bytes += cqe_bcnt;
mlx5e_build_rx_skb(cqe, cqe_bcnt, rq, skb);
napi_gro_receive(rq->cq.napi, skb);
}
-void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+ struct bpf_prog *prog)
{
struct mlx5e_rx_wqe *wqe;
struct sk_buff *skb;
@@ -822,6 +822,8 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
skb_put(skb, cqe_bcnt);
mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+ rq->stats.packets++;
+
wq_ll_pop:
mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
@@ -868,7 +870,8 @@ static inline void mlx5e_mpwqe_fill_rx_skb(struct mlx5e_rq *rq,
skb->len += headlen;
}
-void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+ struct bpf_prog *prog)
{
u16 cstrides = mpwrq_get_cqe_consumed_strides(cqe);
u16 wqe_id = be16_to_cpu(cqe->wqe_id);
@@ -889,6 +892,38 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
goto mpwrq_cqe_out;
}
+
+ /* A bpf program gets first chance to drop the packet. It may
+ * read bytes but not past the end of the frag.
+ */
+ cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe);
+
+ if (prog) {
+ enum bpf_phys_dev_action act;
+ u32 consumed_bytes = ALIGN(cqe_bcnt, rq->mpwqe_stride_sz);
+ u32 wqe_offset = mpwrq_get_cqe_stride_index(cqe) *
+ rq->mpwqe_stride_sz;
+ u32 page_idx = wqe_offset >> PAGE_SHIFT;
+ struct mlx5e_dma_info *dma_info = &wi->dma_info;
+ void *data;
+
+ wi->dma_pre_sync(rq->pdev, wi, wqe_offset, consumed_bytes);
+
+ data = page_address(&dma_info->page[page_idx]) +
+ (wqe_offset & (PAGE_SIZE - 1));
+
+ act = mlx5e_call_bpf(prog, data, consumed_bytes);
+ switch (act) {
+ case BPF_PHYS_DEV_OK:
+ break;
+ case BPF_PHYS_DEV_DROP:
+ default:
+ goto mpwrq_cqe_out;
+ }
+ }
+
skb = napi_alloc_skb(rq->cq.napi,
ALIGN(MLX5_MPWRQ_SMALL_PACKET_THRESHOLD,
sizeof(long)));
@@ -898,12 +933,14 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
}
prefetch(skb->data);
- cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe);
mlx5e_mpwqe_fill_rx_skb(rq, cqe, wi, cqe_bcnt, skb);
+
mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
mpwrq_cqe_out:
+ rq->stats.packets++;
+
if (likely(wi->consumed_strides < rq->mpwqe_num_strides))
return;
@@ -914,6 +951,8 @@ mpwrq_cqe_out:
int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
{
struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq);
+ struct mlx5e_priv *priv = rq->priv;
+ struct bpf_prog *prog;
int work_done = 0;
if (unlikely(test_bit(MLX5E_RQ_STATE_FLUSH_TIMEOUT, &rq->state)))
@@ -922,6 +961,7 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
if (cq->decmprs_left)
work_done += mlx5e_decompress_cqes_cont(rq, cq, 0, budget);
+ prog = READ_ONCE(priv->prog);
for (; work_done < budget; work_done++) {
struct mlx5_cqe64 *cqe = mlx5e_get_cqe(cq);
@@ -937,7 +977,7 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
mlx5_cqwq_pop(&cq->wq);
- rq->handle_rx_cqe(rq, cqe);
+ rq->handle_rx_cqe(rq, cqe, prog);
}
mlx5_cqwq_update_db_record(&cq->wq);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a8f1ea1..513e317 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -426,6 +426,19 @@ struct __sk_buff {
__u32 data_end;
};
+/* user return codes for PHYS_DEV prog type */
+enum bpf_phys_dev_action {
+ BPF_PHYS_DEV_DROP,
+ BPF_PHYS_DEV_OK,
+};
+
+/* user accessible metadata for PHYS_DEV packet hook
+ * new fields must be added to the end of this structure
+ */
+struct bpf_phys_dev_md {
+ __u32 len;
+};
+
struct bpf_tunnel_key {
__u32 tunnel_id;
union {
--
1.8.3.1