Re: [PATCHv5 RFC 1/3] BPF: New helper to obtain namespace data from current task.


neirac
 

Hi,
Yes, I'm still interested as I need this capability at $WORK, but haven't had the time to re-write the dentry functions we need, as currently they could sleep.
I could resume work on this next monday when I get back from vacations.
I really appreciate your help and guidance on this.

Bests


El jue., 18 de jul. de 2019 20:29, Y Song <ys114321@...> escribió:
Hi, Carlos,

Are you still interested in upstreaming this patch? Looks like there
still a desire
to make bcc work inside the containers.

The bpf-next will open next week. If you would like, could you submit again?
I will review the patch on bpf-next as well to make sure we made
forward progress.

Please let me know. Thanks!

Yonghong

On Tue, Apr 16, 2019 at 6:31 PM Carlos Antonio Neira Bustos
<cneirabustos@...> wrote:
>
> As a bpf program cannot sleep, I needed to add a spinlock to kern_path, as
> it calls getname_kernel() which may sleep.
> The inode is accessed directly, as we are just interested in the inode's s_dev.
> Let me know if this approach is the correct one.
> -------------------------------------------------------------------------------
> From 35b7bfcbf6524ec807f107302209a0fb07614cc8 Mon Sep 17 00:00:00 2001
> From: Carlos <cneirabustos@...>
> Date: Tue, 16 Apr 2019 17:10:46 -0400
> Subject: [PATCH] [PATCH bpf-next 1/3] BPF: New helper to obtain namespace data
>   from current task
>
> This helper obtains the active namespace from current and returns pid, tgid,
> device and namespace id as seen from that namespace, allowing to instrument
> a process inside a container.
> Device is read from /proc/self/ns/pid, as in the future it's possible that
> different pid_ns files may belong to different devices, according
> to the discussion between Eric Biederman and Yonghong in 2017 linux plumbers
> conference.
> Currently bpf_get_current_pid_tgid(), is used to do pid filtering in bcc's
> scripts but this helper returns the pid as seen by the root namespace which is
> fine when a bcc script is not executed inside a container.
> When the process of interest is inside a container, pid filtering will not work
> if bpf_get_current_pid_tgid() is used. This helper addresses this limitation
> returning the pid as it's seen by the current namespace where the script is
> executing.
>
> This helper has the same use cases as bpf_get_current_pid_tgid() as it can be
> used to do pid filtering even inside a container.
>
> For example a bcc script using bpf_get_current_pid_tgid() (tools/funccount.py):
>
>         u32 pid = bpf_get_current_pid_tgid() >> 32;
>         if (pid != <pid_arg_passed_in>)
>                 return 0;
> Could be modified to use bpf_get_current_pidns_info() as follows:
>
>         struct bpf_pidns pidns;
>         bpf_get_current_pidns_info(&pidns, sizeof(struct bpf_pidns));
>         u32 pid = pidns.tgid;
>         u32 nsid = pidns.nsid;
>         if ((pid != <pid_arg_passed_in>) && (nsid != <nsid_arg_passed_in>))
>                 return 0;
>
> To find out the name PID namespace id of a process, you could use this command:
>
> $ ps -h -o pidns -p <pid_of_interest>
>
> Or this other command:
>
> $ ls -Li /proc/<pid_of_interest>/ns/pid
>
> Signed-off-by: Carlos Antonio Neira Bustos <cneirabustos@...>
> ---
>  include/linux/bpf.h      |  1 +
>  include/uapi/linux/bpf.h | 25 ++++++++++++++++++-
>  kernel/bpf/core.c        |  1 +
>  kernel/bpf/helpers.c     | 65 ++++++++++++++++++++++++++++++++++++++++++++++++
>  kernel/trace/bpf_trace.c |  2 ++
>  5 files changed, 93 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index e4d4c1771ab0..4393f8f088cc 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -987,6 +987,7 @@ extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
>  extern const struct bpf_func_proto bpf_spin_lock_proto;
>  extern const struct bpf_func_proto bpf_spin_unlock_proto;
>  extern const struct bpf_func_proto bpf_get_local_storage_proto;
> +extern const struct bpf_func_proto bpf_get_current_pidns_info_proto;
>
>  /* Shared helpers among cBPF and eBPF. */
>  void bpf_user_rnd_init_once(void);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 31a27dd337dc..7bf457875c31 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2500,6 +2500,18 @@ union bpf_attr {
>   *     Return
>   *             0 if iph and th are a valid SYN cookie ACK, or a negative error
>   *             otherwise.
> + *
> + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 size_of_pidns)
> + *     Description
> + *             Copies into *pidns* pid, namespace id and tgid as seen by the
> + *             current namespace and also device from /proc/self/ns/pid.
> + *             *size_of_pidns* must be the size of *pidns*
> + *
> + *             This helper is used when pid filtering is needed inside a
> + *             container as bpf_get_current_tgid() helper returns always the
> + *             pid id as seen by the root namespace.
> + *     Return
> + *             0 on success -EINVAL on error.
>   */
>  #define __BPF_FUNC_MAPPER(FN)          \
>         FN(unspec),                     \
> @@ -2602,7 +2614,8 @@ union bpf_attr {
>         FN(skb_ecn_set_ce),             \
>         FN(get_listener_sock),          \
>         FN(skc_lookup_tcp),             \
> -       FN(tcp_check_syncookie),
> +       FN(tcp_check_syncookie),        \
> +       FN(get_current_pidns_info),
>
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -3298,4 +3311,14 @@ struct bpf_line_info {
>  struct bpf_spin_lock {
>         __u32   val;
>  };
> +
> +/* helper bpf_get_current_pidns_info will store the following
> + * data, dev will contain major/minor from /proc/self/pid.
> +*/
> +struct bpf_pidns_info {
> +       __u32 dev;
> +       __u32 nsid;
> +       __u32 tgid;
> +       __u32 pid;
> +};
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index ace8c22c8b0e..ecbdc72ba459 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -2046,6 +2046,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
>  const struct bpf_func_proto bpf_get_current_comm_proto __weak;
>  const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
>  const struct bpf_func_proto bpf_get_local_storage_proto __weak;
> +const struct bpf_func_proto bpf_get_current_pidns_info __weak;
>
>  const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
>  {
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index a411fc17d265..2de82d14424d 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -18,6 +18,11 @@
>  #include <linux/sched.h>
>  #include <linux/uidgid.h>
>  #include <linux/filter.h>
> +#include <linux/pid_namespace.h>
> +#include <linux/major.h>
> +#include <linux/stat.h>
> +#include <linux/namei.h>
> +#include <linux/version.h>
>
>  /* If kernel subsystem is allowing eBPF programs to call this function,
>   * inside its own verifier_ops->get_func_proto() callback it should return
> @@ -317,6 +322,66 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
>         preempt_enable();
>  }
>
> +BPF_CALL_2(bpf_get_current_pidns_info, struct bpf_pidns_info *, pidns_info, u32,
> +        size)
> +{
> +       const char *pidnspath = "/proc/self/ns/pid";
> +       struct pid_namespace *pidns = NULL;
> +       DEFINE_SPINLOCK(bpf_spinlock);
> +       struct inode *inode;
> +       struct kstat ks;
> +       struct path kp;
> +       pid_t tgid = 0;
> +       pid_t pid = 0;
> +       int ret;
> +
> +       if (unlikely(size != sizeof(struct bpf_pidns_info)))
> +               goto clear;
> +
> +       pidns = task_active_pid_ns(current);
> +
> +       if (unlikely(!pidns))
> +               goto clear;
> +
> +       pidns_info->nsid =  pidns->ns.inum;
> +       pid = task_pid_nr_ns(current, pidns);
> +
> +       if (unlikely(!pid))
> +               goto clear;
> +
> +       tgid = task_tgid_nr_ns(current, pidns);
> +
> +       if (unlikely(!tgid))
> +               goto clear;
> +
> +       pidns_info->tgid = (u32) tgid;
> +       pidns_info->pid = (u32) pid;
> +
> +       spin_lock(&bpf_spinlock);
> +        ret = kern_path(pidnspath, 0, &kp);
> +       if (ret)
> +               goto clear;
> +       inode = d_backing_inode(kp.dentry);
> +       pidns_info->dev = inode->i_sb->s_dev;
> +       spin_unlock(&bpf_spinlock);
> +
> +       return 0;
> +
> +       clear:
> +       if (pidns_info)
> +               memset((void *)pidns, 0, (size_t) size);
> +
> +       return -EINVAL;
> +}
> +
> +const struct bpf_func_proto bpf_get_current_pidns_info_proto = {
> +       .func   = bpf_get_current_pidns_info,
> +       .gpl_only       = false,
> +       .ret_type       = RET_INTEGER,
> +       .arg1_type      = ARG_PTR_TO_UNINIT_MEM,
> +       .arg2_type      = ARG_CONST_SIZE,
> +};
> +
>  #ifdef CONFIG_CGROUPS
>  BPF_CALL_0(bpf_get_current_cgroup_id)
>  {
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index d64c00afceb5..2ef0de78d4ec 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -603,6 +603,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>         case BPF_FUNC_get_current_cgroup_id:
>                 return &bpf_get_current_cgroup_id_proto;
>  #endif
> +       case BPF_FUNC_get_current_pidns_info:
> +               return &bpf_get_current_pidns_info_proto;
>         default:
>                 return NULL;
>         }
> --
> 2.11.0
>

Join {iovisor-dev@lists.iovisor.org to automatically receive all group messages.