Topics

[PATCHv5 RFC 1/3] BPF: New helper to obtain namespace data from current task (FIX).

cnb
 

I'm sorry there was an error on the code, the spinlock was never released if kern_path()
failed.Here is the fix.

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 2de82d14424d..2a83eba63182 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -359,11 +359,11 @@ BPF_CALL_2(bpf_get_current_pidns_info, struct bpf_pidns_info *, pidns_info, u32,

spin_lock(&bpf_spinlock);
ret = kern_path(pidnspath, 0, &kp);
+ spin_unlock(&bpf_spinlock);
if (ret)
goto clear;
inode = d_backing_inode(kp.dentry);
pidns_info->dev = inode->i_sb->s_dev;
- spin_unlock(&bpf_spinlock);

return 0;

On Tue, Apr 16, 2019 at 09:31:35PM -0400, cnb via Lists.Iovisor.Org wrote:
As a bpf program cannot sleep, I needed to add a spinlock to kern_path, as
it calls getname_kernel() which may sleep.
The inode is accessed directly, as we are just interested in the inode's s_dev.
Let me know if this approach is the correct one.
-------------------------------------------------------------------------------
From 35b7bfcbf6524ec807f107302209a0fb07614cc8 Mon Sep 17 00:00:00 2001
From: Carlos <cneirabustos@...>
Date: Tue, 16 Apr 2019 17:10:46 -0400
Subject: [PATCH] [PATCH bpf-next 1/3] BPF: New helper to obtain namespace data
from current task

This helper obtains the active namespace from current and returns pid, tgid,
device and namespace id as seen from that namespace, allowing to instrument
a process inside a container.
Device is read from /proc/self/ns/pid, as in the future it's possible that
different pid_ns files may belong to different devices, according
to the discussion between Eric Biederman and Yonghong in 2017 linux plumbers
conference.
Currently bpf_get_current_pid_tgid(), is used to do pid filtering in bcc's
scripts but this helper returns the pid as seen by the root namespace which is
fine when a bcc script is not executed inside a container.
When the process of interest is inside a container, pid filtering will not work
if bpf_get_current_pid_tgid() is used. This helper addresses this limitation
returning the pid as it's seen by the current namespace where the script is
executing.

This helper has the same use cases as bpf_get_current_pid_tgid() as it can be
used to do pid filtering even inside a container.

For example a bcc script using bpf_get_current_pid_tgid() (tools/funccount.py):

u32 pid = bpf_get_current_pid_tgid() >> 32;
if (pid != <pid_arg_passed_in>)
return 0;
Could be modified to use bpf_get_current_pidns_info() as follows:

struct bpf_pidns pidns;
bpf_get_current_pidns_info(&pidns, sizeof(struct bpf_pidns));
u32 pid = pidns.tgid;
u32 nsid = pidns.nsid;
if ((pid != <pid_arg_passed_in>) && (nsid != <nsid_arg_passed_in>))
return 0;

To find out the name PID namespace id of a process, you could use this command:

$ ps -h -o pidns -p <pid_of_interest>

Or this other command:

$ ls -Li /proc/<pid_of_interest>/ns/pid

Signed-off-by: Carlos Antonio Neira Bustos <cneirabustos@...>
---
include/linux/bpf.h | 1 +
include/uapi/linux/bpf.h | 25 ++++++++++++++++++-
kernel/bpf/core.c | 1 +
kernel/bpf/helpers.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++
kernel/trace/bpf_trace.c | 2 ++
5 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e4d4c1771ab0..4393f8f088cc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -987,6 +987,7 @@ extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
extern const struct bpf_func_proto bpf_spin_lock_proto;
extern const struct bpf_func_proto bpf_spin_unlock_proto;
extern const struct bpf_func_proto bpf_get_local_storage_proto;
+extern const struct bpf_func_proto bpf_get_current_pidns_info_proto;

/* Shared helpers among cBPF and eBPF. */
void bpf_user_rnd_init_once(void);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 31a27dd337dc..7bf457875c31 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2500,6 +2500,18 @@ union bpf_attr {
* Return
* 0 if iph and th are a valid SYN cookie ACK, or a negative error
* otherwise.
+ *
+ * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 size_of_pidns)
+ * Description
+ * Copies into *pidns* pid, namespace id and tgid as seen by the
+ * current namespace and also device from /proc/self/ns/pid.
+ * *size_of_pidns* must be the size of *pidns*
+ *
+ * This helper is used when pid filtering is needed inside a
+ * container as bpf_get_current_tgid() helper returns always the
+ * pid id as seen by the root namespace.
+ * Return
+ * 0 on success -EINVAL on error.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -2602,7 +2614,8 @@ union bpf_attr {
FN(skb_ecn_set_ce), \
FN(get_listener_sock), \
FN(skc_lookup_tcp), \
- FN(tcp_check_syncookie),
+ FN(tcp_check_syncookie), \
+ FN(get_current_pidns_info),

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -3298,4 +3311,14 @@ struct bpf_line_info {
struct bpf_spin_lock {
__u32 val;
};
+
+/* helper bpf_get_current_pidns_info will store the following
+ * data, dev will contain major/minor from /proc/self/pid.
+*/
+struct bpf_pidns_info {
+ __u32 dev;
+ __u32 nsid;
+ __u32 tgid;
+ __u32 pid;
+};
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ace8c22c8b0e..ecbdc72ba459 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2046,6 +2046,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_local_storage_proto __weak;
+const struct bpf_func_proto bpf_get_current_pidns_info __weak;

const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a411fc17d265..2de82d14424d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -18,6 +18,11 @@
#include <linux/sched.h>
#include <linux/uidgid.h>
#include <linux/filter.h>
+#include <linux/pid_namespace.h>
+#include <linux/major.h>
+#include <linux/stat.h>
+#include <linux/namei.h>
+#include <linux/version.h>

/* If kernel subsystem is allowing eBPF programs to call this function,
* inside its own verifier_ops->get_func_proto() callback it should return
@@ -317,6 +322,66 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
preempt_enable();
}

+BPF_CALL_2(bpf_get_current_pidns_info, struct bpf_pidns_info *, pidns_info, u32,
+ size)
+{
+ const char *pidnspath = "/proc/self/ns/pid";
+ struct pid_namespace *pidns = NULL;
+ DEFINE_SPINLOCK(bpf_spinlock);
+ struct inode *inode;
+ struct kstat ks;
+ struct path kp;
+ pid_t tgid = 0;
+ pid_t pid = 0;
+ int ret;
+
+ if (unlikely(size != sizeof(struct bpf_pidns_info)))
+ goto clear;
+
+ pidns = task_active_pid_ns(current);
+
+ if (unlikely(!pidns))
+ goto clear;
+
+ pidns_info->nsid = pidns->ns.inum;
+ pid = task_pid_nr_ns(current, pidns);
+
+ if (unlikely(!pid))
+ goto clear;
+
+ tgid = task_tgid_nr_ns(current, pidns);
+
+ if (unlikely(!tgid))
+ goto clear;
+
+ pidns_info->tgid = (u32) tgid;
+ pidns_info->pid = (u32) pid;
+
+ spin_lock(&bpf_spinlock);
+ ret = kern_path(pidnspath, 0, &kp);
+ if (ret)
+ goto clear;
+ inode = d_backing_inode(kp.dentry);
+ pidns_info->dev = inode->i_sb->s_dev;
+ spin_unlock(&bpf_spinlock);
+
+ return 0;
+
+ clear:
+ if (pidns_info)
+ memset((void *)pidns, 0, (size_t) size);
+
+ return -EINVAL;
+}
+
+const struct bpf_func_proto bpf_get_current_pidns_info_proto = {
+ .func = bpf_get_current_pidns_info,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+};
+
#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d64c00afceb5..2ef0de78d4ec 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -603,6 +603,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_current_cgroup_id:
return &bpf_get_current_cgroup_id_proto;
#endif
+ case BPF_FUNC_get_current_pidns_info:
+ return &bpf_get_current_pidns_info_proto;
default:
return NULL;
}
--
2.11.0