Re: [PATCHv5 RFC 1/3] BPF: New helper to obtain namespace data from current task.


neirac
 

Hey Yonghong,
I have changed getname_kernel interface to specify the allocation type,
all previous callers preserve GFP_KERNEL as the allocation type, after this change
getname_kernel could be used with GFP_ATOMIC in ebpf helpers.
If this change goes in, I could complete bpf_get_current_pidns_info helper.
Let me know your thoughts.

From dad6b281a744afb88660137a6a5f27057a72d7d5 Mon Sep 17 00:00:00 2001
From: Carlos <cneirabustos@...>
Date: Mon, 22 Jul 2019 17:50:02 -0400
Subject: [PATCH] [bpf-next 1/1] BPF: getname_kernel specify allocation
type

A parameter to specify the allocation type has been added to getname_kernel, as
currently all allocations done with getname_kernel could sleep, the reason for this
change is that currently ebpf cannot call functions that could sleep and future work
will need to call getname_kernel that currently could sleep on allocation.
This change preserves the GFP_KERNEL allocation type on all existing code that calls
getname_kernel.
---
fs/coredump.c | 2 +-
fs/exec.c | 2 +-
fs/fs_parser.c | 2 +-
fs/namei.c | 23 ++++++++++++++---------
fs/open.c | 2 +-
include/linux/fs.h | 2 +-
init/main.c | 2 +-
kernel/umh.c | 2 +-
8 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index e42e17e55bfd..5c43a32f7ba2 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -684,7 +684,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
* If it doesn't exist, that's fine. If there's some
* other problem, we'll catch it at the filp_open().
*/
- do_unlinkat(AT_FDCWD, getname_kernel(cn.corename));
+ do_unlinkat(AT_FDCWD, getname_kernel(cn.corename, GFP_KERNEL));
}

/*
diff --git a/fs/exec.c b/fs/exec.c
index c71cbfe6826a..3811afb37d9a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -882,7 +882,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)

struct file *open_exec(const char *name)
{
- struct filename *filename = getname_kernel(name);
+ struct filename *filename = getname_kernel(name, GFP_KERNEL);
struct file *f = ERR_CAST(filename);

if (!IS_ERR(filename)) {
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index 0d388faa25d1..2a8f27cbbf39 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -248,7 +248,7 @@ int fs_lookup_param(struct fs_context *fc,

switch (param->type) {
case fs_value_is_string:
- f = getname_kernel(param->string);
+ f = getname_kernel(param->string, GFP_KERNEL);
if (IS_ERR(f))
return PTR_ERR(f);
put_f = true;
diff --git a/fs/namei.c b/fs/namei.c
index 209c51a5226c..0e0a9710bdfe 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -122,6 +122,11 @@
* PATH_MAX includes the nul terminator --RR.
*/

+/* [July 2019 neirac] getname_kernel added allocation type parameter as
+ * currently all allocations on getname_kernel could sleep, but ebpf
+ * needs non blocking allocations.
+ */
+
#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))

struct filename *
@@ -210,12 +215,12 @@ getname(const char __user * filename)
}

struct filename *
-getname_kernel(const char * filename)
+getname_kernel(const char * filename, gfp_t flags)
{
struct filename *result;
int len = strlen(filename) + 1;

- result = __getname();
+ result = kmem_cache_alloc(names_cachep, flags);
if (unlikely(!result))
return ERR_PTR(-ENOMEM);

@@ -225,7 +230,7 @@ getname_kernel(const char * filename)
const size_t size = offsetof(struct filename, iname[1]);
struct filename *tmp;

- tmp = kmalloc(size, GFP_KERNEL);
+ tmp = kmalloc(size, flags);
if (unlikely(!tmp)) {
__putname(result);
return ERR_PTR(-ENOMEM);
@@ -2408,7 +2413,7 @@ struct dentry *kern_path_locked(const char *name, struct path *path)
struct qstr last;
int type;

- filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
+ filename = filename_parentat(AT_FDCWD, getname_kernel(name, GFP_KERNEL), 0, path,
&last, &type);
if (IS_ERR(filename))
return ERR_CAST(filename);
@@ -2429,7 +2434,7 @@ struct dentry *kern_path_locked(const char *name, struct path *path)

int kern_path(const char *name, unsigned int flags, struct path *path)
{
- return filename_lookup(AT_FDCWD, getname_kernel(name),
+ return filename_lookup(AT_FDCWD, getname_kernel(name, GFP_KERNEL),
flags, path, NULL);
}
EXPORT_SYMBOL(kern_path);
@@ -2448,7 +2453,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
{
struct path root = {.mnt = mnt, .dentry = dentry};
/* the first argument of filename_lookup() is ignored with root */
- return filename_lookup(AT_FDCWD, getname_kernel(name),
+ return filename_lookup(AT_FDCWD, getname_kernel(name, GFP_KERNEL),
flags , path, &root);
}
EXPORT_SYMBOL(vfs_path_lookup);
@@ -2749,7 +2754,7 @@ int
kern_path_mountpoint(int dfd, const char *name, struct path *path,
unsigned int flags)
{
- return filename_mountpoint(dfd, getname_kernel(name), path, flags);
+ return filename_mountpoint(dfd, getname_kernel(name, GFP_KERNEL), path, flags);
}
EXPORT_SYMBOL(kern_path_mountpoint);

@@ -3583,7 +3588,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
return ERR_PTR(-ELOOP);

- filename = getname_kernel(name);
+ filename = getname_kernel(name, GFP_KERNEL);
if (IS_ERR(filename))
return ERR_CAST(filename);

@@ -3672,7 +3677,7 @@ static struct dentry *filename_create(int dfd, struct filename *name,
struct dentry *kern_path_create(int dfd, const char *pathname,
struct path *path, unsigned int lookup_flags)
{
- return filename_create(dfd, getname_kernel(pathname),
+ return filename_create(dfd, getname_kernel(pathname, GFP_KERNEL),
path, lookup_flags);
}
EXPORT_SYMBOL(kern_path_create);
diff --git a/fs/open.c b/fs/open.c
index b5b80469b93d..cc0e23a605f3 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1030,7 +1030,7 @@ struct file *file_open_name(struct filename *name, int flags, umode_t mode)
*/
struct file *filp_open(const char *filename, int flags, umode_t mode)
{
- struct filename *name = getname_kernel(filename);
+ struct filename *name = getname_kernel(filename, GFP_KERNEL);
struct file *file = ERR_CAST(name);

if (!IS_ERR(name)) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 75f2ed289a3f..826ab644f9bf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2532,7 +2532,7 @@ extern int filp_close(struct file *, fl_owner_t id);

extern struct filename *getname_flags(const char __user *, int, int *);
extern struct filename *getname(const char __user *);
-extern struct filename *getname_kernel(const char *);
+extern struct filename *getname_kernel(const char *, gfp_t flags);
extern void putname(struct filename *name);

extern int finish_open(struct file *file, struct dentry *dentry,
diff --git a/init/main.c b/init/main.c
index ff5803b0841c..29972a5a0ebc 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1046,7 +1046,7 @@ static int run_init_process(const char *init_filename)
{
argv_init[0] = init_filename;
pr_info("Run %s as init process\n", init_filename);
- return do_execve(getname_kernel(init_filename),
+ return do_execve(getname_kernel(init_filename, GFP_KERNEL),
(const char __user *const __user *)argv_init,
(const char __user *const __user *)envp_init);
}
diff --git a/kernel/umh.c b/kernel/umh.c
index 7f255b5a8845..f99e08fc327c 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -109,7 +109,7 @@ static int call_usermodehelper_exec_async(void *data)
if (!retval)
current->flags |= PF_UMH;
} else
- retval = do_execve(getname_kernel(sub_info->path),
+ retval = do_execve(getname_kernel(sub_info->path, GFP_KERNEL),
(const char __user *const __user *)sub_info->argv,
(const char __user *const __user *)sub_info->envp);
out:
--
2.11.0

On Tue, Jul 23, 2019 at 12:55:37PM -0400, Carlos Antonio Neira Bustos wrote:
Yonghong,

I totally agree with this new approach as it isolates the allocation type per
caller.That way we should not have any regressions on existing code, and the
only caller using GFP_ATOMIC getname_kernel should be at this point the ebpf
helper, I'll also change __getname to use the allocation type requested by
getname_kernel flags.

Thanks again for your help.



On Tue, Jul 23, 2019 at 08:56:03AM -0700, Y Song wrote:
On Tue, Jul 23, 2019 at 7:37 AM Carlos Antonio Neira Bustos
<cneirabustos@...> wrote:

Hey Yonghong,
I also needed to replace the call to __getname as allocations on the slab cache
could sleep, that makes getname_kernel the only name cache consumer that does not block.
What do you think ?
Maybe the following alternative approach is better to preserve the
existing behavior.
Change getname_kernel(filename) to getname_kernel(filename, flags) and
the *flags* will
be used for kmem_cache_alloc and kmalloc. Existing behavior won't
change and bpf helper
will call getname_kernel(filename, GFP_ATOMIC). More code changes but
cleaner and not
unnecessarily overusing kernel atomic memory region.




From 639233cc05e96a81054b049d16d1b9193ae667e9 Mon Sep 17 00:00:00 2001
From: Carlos <cneirabustos@...>
Date: Mon, 22 Jul 2019 17:50:02 -0400
Subject: [PATCH] [bpf-next 1/1] BPF: getname_kernel don't block on
allocation.

This patch calls directly kmem_cache_alloc to specify the allocation type as GFP_ATOMIC,
the reason is that currently ebpf cannot call functions that could sleep and future work
will need to call kern_path that currently could sleep on allocation.
---
fs/namei.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 209c51a5226c..a4269ca52503 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -122,6 +122,10 @@
* PATH_MAX includes the nul terminator --RR.
*/

+/* [July 2019 neirac] getname_kernel changed allocation type to GFP_ATOMIC,
+ * as ebpf needs to use kern_path.
+ */
+
#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))

struct filename *
@@ -215,7 +219,7 @@ getname_kernel(const char * filename)
struct filename *result;
int len = strlen(filename) + 1;

- result = __getname();
+ result = kmem_cache_alloc(names_cachep, GFP_ATOMIC);
if (unlikely(!result))
return ERR_PTR(-ENOMEM);

@@ -225,7 +229,7 @@ getname_kernel(const char * filename)
const size_t size = offsetof(struct filename, iname[1]);
struct filename *tmp;

- tmp = kmalloc(size, GFP_KERNEL);
+ tmp = kmalloc(size, GFP_ATOMIC);
if (unlikely(!tmp)) {
__putname(result);
return ERR_PTR(-ENOMEM);
--
2.11.0



On Fri, Jul 19, 2019 at 09:06:54AM -0700, Y Song wrote:
On Fri, Jul 19, 2019 at 6:02 AM cnb <cneirabustos@...> wrote:

kern_path is the one, should I move this and dependencies to a new file called bfp_namei.c, in there I'll change it to use GFP_ATOMIC or create a new function to replace kern_path. What do you think?
I think adding GFP_ATOMIC to function getname_kernel() is reasonable.

const size_t size = offsetof(struct filename, iname[1]);
struct filename *tmp;

tmp = kmalloc(size, GFP_KERNEL);

/* fs/open.c */
struct audit_names;
struct filename {
const char *name; /* pointer to actual string */
const __user char *uptr; /* original userland pointer */
int refcnt;
struct audit_names *aname;
const char iname[];
};

The size is pretty small. on x64, it should be 25 bytes. If the system
cannot honor
this 25 bytes in non-blocking mode, it is probably already in stress,
bpf subsystem itself may not work reliably as it uses some GFP_ATOMIC
as well for some map (e.g., update) helpers.

So let us have this patch (adding GFP_ATOMIC) in the first series. We can
think of alternatives (e.g., separate functions) if anybody objects.


El jue., 18 de jul. de 2019 22:10, Y Song <ys114321@...> escribió:

On Thu, Jul 18, 2019 at 6:21 PM carlos antonio neira bustos
<cneirabustos@...> wrote:

Hi,
Yes, I'm still interested as I need this capability at $WORK, but haven't had the time to re-write the dentry functions we need, as currently they could sleep.
I could resume work on this next monday when I get back from vacations.
I really appreciate your help and guidance on this.
Great. Which dentry function are you referring to? dput? kern_path
cannot be used as it uses
kmalloc without GFP_ATOMIC. See https://github.com/iovisor/bcc/issues/1329.


Bests


El jue., 18 de jul. de 2019 20:29, Y Song <ys114321@...> escribió:

Hi, Carlos,

Are you still interested in upstreaming this patch? Looks like there
still a desire
to make bcc work inside the containers.

The bpf-next will open next week. If you would like, could you submit again?
I will review the patch on bpf-next as well to make sure we made
forward progress.

Please let me know. Thanks!

Yonghong

On Tue, Apr 16, 2019 at 6:31 PM Carlos Antonio Neira Bustos
<cneirabustos@...> wrote:

As a bpf program cannot sleep, I needed to add a spinlock to kern_path, as
it calls getname_kernel() which may sleep.
The inode is accessed directly, as we are just interested in the inode's s_dev.
Let me know if this approach is the correct one.
-------------------------------------------------------------------------------
From 35b7bfcbf6524ec807f107302209a0fb07614cc8 Mon Sep 17 00:00:00 2001
From: Carlos <cneirabustos@...>
Date: Tue, 16 Apr 2019 17:10:46 -0400
Subject: [PATCH] [PATCH bpf-next 1/3] BPF: New helper to obtain namespace data
from current task

This helper obtains the active namespace from current and returns pid, tgid,
device and namespace id as seen from that namespace, allowing to instrument
a process inside a container.
Device is read from /proc/self/ns/pid, as in the future it's possible that
different pid_ns files may belong to different devices, according
to the discussion between Eric Biederman and Yonghong in 2017 linux plumbers
conference.
Currently bpf_get_current_pid_tgid(), is used to do pid filtering in bcc's
scripts but this helper returns the pid as seen by the root namespace which is
fine when a bcc script is not executed inside a container.
When the process of interest is inside a container, pid filtering will not work
if bpf_get_current_pid_tgid() is used. This helper addresses this limitation
returning the pid as it's seen by the current namespace where the script is
executing.

This helper has the same use cases as bpf_get_current_pid_tgid() as it can be
used to do pid filtering even inside a container.

For example a bcc script using bpf_get_current_pid_tgid() (tools/funccount.py):

u32 pid = bpf_get_current_pid_tgid() >> 32;
if (pid != <pid_arg_passed_in>)
return 0;
Could be modified to use bpf_get_current_pidns_info() as follows:

struct bpf_pidns pidns;
bpf_get_current_pidns_info(&pidns, sizeof(struct bpf_pidns));
u32 pid = pidns.tgid;
u32 nsid = pidns.nsid;
if ((pid != <pid_arg_passed_in>) && (nsid != <nsid_arg_passed_in>))
return 0;

To find out the name PID namespace id of a process, you could use this command:

$ ps -h -o pidns -p <pid_of_interest>

Or this other command:

$ ls -Li /proc/<pid_of_interest>/ns/pid

Signed-off-by: Carlos Antonio Neira Bustos <cneirabustos@...>
---
include/linux/bpf.h | 1 +
include/uapi/linux/bpf.h | 25 ++++++++++++++++++-
kernel/bpf/core.c | 1 +
kernel/bpf/helpers.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++
kernel/trace/bpf_trace.c | 2 ++
5 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e4d4c1771ab0..4393f8f088cc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -987,6 +987,7 @@ extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
extern const struct bpf_func_proto bpf_spin_lock_proto;
extern const struct bpf_func_proto bpf_spin_unlock_proto;
extern const struct bpf_func_proto bpf_get_local_storage_proto;
+extern const struct bpf_func_proto bpf_get_current_pidns_info_proto;

/* Shared helpers among cBPF and eBPF. */
void bpf_user_rnd_init_once(void);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 31a27dd337dc..7bf457875c31 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2500,6 +2500,18 @@ union bpf_attr {
* Return
* 0 if iph and th are a valid SYN cookie ACK, or a negative error
* otherwise.
+ *
+ * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 size_of_pidns)
+ * Description
+ * Copies into *pidns* pid, namespace id and tgid as seen by the
+ * current namespace and also device from /proc/self/ns/pid.
+ * *size_of_pidns* must be the size of *pidns*
+ *
+ * This helper is used when pid filtering is needed inside a
+ * container as bpf_get_current_tgid() helper returns always the
+ * pid id as seen by the root namespace.
+ * Return
+ * 0 on success -EINVAL on error.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -2602,7 +2614,8 @@ union bpf_attr {
FN(skb_ecn_set_ce), \
FN(get_listener_sock), \
FN(skc_lookup_tcp), \
- FN(tcp_check_syncookie),
+ FN(tcp_check_syncookie), \
+ FN(get_current_pidns_info),

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -3298,4 +3311,14 @@ struct bpf_line_info {
struct bpf_spin_lock {
__u32 val;
};
+
+/* helper bpf_get_current_pidns_info will store the following
+ * data, dev will contain major/minor from /proc/self/pid.
+*/
+struct bpf_pidns_info {
+ __u32 dev;
+ __u32 nsid;
+ __u32 tgid;
+ __u32 pid;
+};
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ace8c22c8b0e..ecbdc72ba459 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2046,6 +2046,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_local_storage_proto __weak;
+const struct bpf_func_proto bpf_get_current_pidns_info __weak;

const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a411fc17d265..2de82d14424d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -18,6 +18,11 @@
#include <linux/sched.h>
#include <linux/uidgid.h>
#include <linux/filter.h>
+#include <linux/pid_namespace.h>
+#include <linux/major.h>
+#include <linux/stat.h>
+#include <linux/namei.h>
+#include <linux/version.h>

/* If kernel subsystem is allowing eBPF programs to call this function,
* inside its own verifier_ops->get_func_proto() callback it should return
@@ -317,6 +322,66 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
preempt_enable();
}

+BPF_CALL_2(bpf_get_current_pidns_info, struct bpf_pidns_info *, pidns_info, u32,
+ size)
+{
+ const char *pidnspath = "/proc/self/ns/pid";
+ struct pid_namespace *pidns = NULL;
+ DEFINE_SPINLOCK(bpf_spinlock);
+ struct inode *inode;
+ struct kstat ks;
+ struct path kp;
+ pid_t tgid = 0;
+ pid_t pid = 0;
+ int ret;
+
+ if (unlikely(size != sizeof(struct bpf_pidns_info)))
+ goto clear;
+
+ pidns = task_active_pid_ns(current);
+
+ if (unlikely(!pidns))
+ goto clear;
+
+ pidns_info->nsid = pidns->ns.inum;
+ pid = task_pid_nr_ns(current, pidns);
+
+ if (unlikely(!pid))
+ goto clear;
+
+ tgid = task_tgid_nr_ns(current, pidns);
+
+ if (unlikely(!tgid))
+ goto clear;
+
+ pidns_info->tgid = (u32) tgid;
+ pidns_info->pid = (u32) pid;
+
+ spin_lock(&bpf_spinlock);
+ ret = kern_path(pidnspath, 0, &kp);
+ if (ret)
+ goto clear;
+ inode = d_backing_inode(kp.dentry);
+ pidns_info->dev = inode->i_sb->s_dev;
+ spin_unlock(&bpf_spinlock);
+
+ return 0;
+
+ clear:
+ if (pidns_info)
+ memset((void *)pidns, 0, (size_t) size);
+
+ return -EINVAL;
+}
+
+const struct bpf_func_proto bpf_get_current_pidns_info_proto = {
+ .func = bpf_get_current_pidns_info,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+};
+
#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d64c00afceb5..2ef0de78d4ec 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -603,6 +603,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_current_cgroup_id:
return &bpf_get_current_cgroup_id_proto;
#endif
+ case BPF_FUNC_get_current_pidns_info:
+ return &bpf_get_current_pidns_info_proto;
default:
return NULL;
}
--
2.11.0

Join iovisor-dev@lists.iovisor.org to automatically receive all group messages.