Re: Print slow commit operations in nfsslower


TATSUKAWA KOSUKE(立川 江介)
 

Hi,

On Tue, Nov 15, 2022 at 2:04 AM TATSUKAWA KOSUKE(立川 江介)
<tatsu-ab1@...> wrote:

Hi,

I was using nfsslower in bcc tools and noticed that it doesn't print out
slow commit operations, which sometimes cause long delays in NFS writes.
I tried adding code to trace slow commit operations and created the
attached patch.
The concept and code looks okay to me. One thing is you could use
RAW_TRACEPOINT_PROBE macro to be consistent with some other
bcc tools raw tracepoint usages.

Please submit a pull request.
thank you for the reply. I'll change the code to use
RAW_TRACEPOINT_PROBE macro used in other bcc tools.

While testing on some old environments which doesn't support raw
tracepoints, I noticed that linux/nfs_fs.h header file causes bcc to
print the error message "use of undeclared identifier 'KBUILD_MODNAME'".
This problem seems to affect other bcc tools, which #define this
symbol to work around it.

I'll submit a pull request once I've done a little more testing.

Best regards.


Please let me hear your opinions.

NFS commits are quite slow and are not performed often, so the patch
shouldn't incur visible overhead.

Below is a sample output from the modified tool when writing a large
file using dd command. Most commits are issued from kworker threads,
so the commit output may be mostly dropped when filtered by PID.

TIME COMM PID T BYTES OFF_KB LAT(ms) FILENAME
...
16:46:11 dd 1212 W 1048576 651264 46.58 testfile
16:46:11 dd 1212 W 1048576 653312 54.41 testfile
16:46:11 dd 1212 W 1048576 654336 18.96 testfile
16:46:11 dd 1212 W 1048576 655360 49.05 testfile
16:46:11 dd 1212 W 1048576 657408 82.96 testfile
16:46:11 dd 1212 W 1048576 659456 109.25 testfile
16:46:12 dd 1212 W 1048576 660480 163.55 testfile
16:46:12 dd 1212 W 1048576 662528 205.44 testfile
16:46:13 dd 1212 W 1048576 663552 751.02 testfile
16:46:37 kworker/u8:5 1207 C 0 0 27449.05 testfile
16:46:37 kworker/u8:5 1207 C 0 0 26725.16 testfile
16:46:37 kworker/u8:5 1207 C 0 0 27592.04 testfile
16:46:37 kworker/u8:4 1206 C 0 0 22188.75 testfile
16:46:37 kworker/u8:4 1206 C 0 0 26092.59 testfile
16:46:37 kworker/u8:4 1206 C 0 0 27268.90 testfile
16:46:37 kworker/u8:4 1206 C 0 0 22303.24 testfile
16:46:37 kworker/u8:4 1206 C 0 0 27081.34 testfile
16:46:37 dd 1212 W 1048576 664576 24337.80 testfile
16:46:38 dd 1212 W 1048576 958464 61.77 testfile
16:46:38 dd 1212 W 1048576 960512 56.60 testfile
16:46:38 dd 1212 W 1048576 963584 55.75 testfile
16:46:38 dd 1212 W 1048576 965632 54.84 testfile
...

Best regards.
---
Kosuke TATSUKAWA | 1st Platform Software Division
| NEC Solution Innovators
| tatsu-ab1@...

------------------------------------------------------------------------
diff -rup a/tools/nfsslower.py b/tools/nfsslower.py
--- a/tools/nfsslower 2022-11-14 12:39:23.677073240 +0900
+++ b/tools/nfsslower 2022-11-14 14:44:49.960073240 +0900
@@ -9,6 +9,8 @@
# This script traces some common NFS operations: read, write, opens and
# getattr. It measures the time spent in these operations, and prints details
# for each that exceeded a threshold.
+# The script also traces commit operations, which is specific to nfs and could
+# be pretty slow.
#
# WARNING: This adds low-overhead instrumentation to these NFS operations,
# including reads and writes from the file system cache. Such reads and writes
@@ -25,6 +27,8 @@
# Currently there aren't any tracepoints available for nfs_read_file,
# nfs_write_file and nfs_open_file, nfs_getattr does have entry and exit
# tracepoints but we chose to use kprobes for consistency
+# Raw tracepoints are used to trace nfs:nfs_initiate_commit and
+# nfs:nfs_commit_done.
#
# 31-Aug-2017 Samuel Nair created this. Should work with NFSv{3,4}

@@ -41,8 +45,8 @@ examples = """
./nfsslower -p 121 # trace pid 121 only
"""
parser = argparse.ArgumentParser(
- description="""Trace READ, WRITE, OPEN \
-and GETATTR NFS calls slower than a threshold,\
+ description="""Trace READ, WRITE, OPEN, GETATTR \
+and COMMIT NFS calls slower than a threshold,\
supports NFSv{3,4}""",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
@@ -66,11 +70,13 @@ bpf_text = """
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/dcache.h>
+#include <linux/nfs_fs.h>

#define TRACE_READ 0
#define TRACE_WRITE 1
#define TRACE_OPEN 2
#define TRACE_GETATTR 3
+#define TRACE_COMMIT 4

struct val_t {
u64 ts;
@@ -79,6 +85,12 @@ struct val_t {
struct dentry *d;
};

+struct commit_t {
+ u64 ts;
+ u64 offset;
+ u64 count;
+};
+
struct data_t {
// XXX: switch some to u32's when supported
u64 ts_us;
@@ -93,6 +105,7 @@ struct data_t {

BPF_HASH(entryinfo, u64, struct val_t);
BPF_PERF_OUTPUT(events);
+BPF_HASH(commitinfo, u64, struct commit_t);

int trace_rw_entry(struct pt_regs *ctx, struct kiocb *iocb,
struct iov_iter *data)
@@ -227,6 +240,56 @@ int trace_getattr_return(struct pt_regs
return trace_exit(ctx, TRACE_GETATTR);
}

+int raw_nfs_initiate_commit(struct bpf_raw_tracepoint_args *ctx) {
+ u64 key = (u64)ctx->args[0];
+ struct nfs_commit_data *cd = (struct nfs_commit_data *)key;
+ struct commit_t c = { 0 };
+
+ c.ts = bpf_ktime_get_ns();
+ bpf_probe_read_kernel(&c.offset, sizeof(cd->args.offset),
+ &cd->args.offset);
+ bpf_probe_read_kernel(&c.count, sizeof(cd->args.count), &cd->args.count);
+ commitinfo.update(&key, &c);
+ return 0;
+}
+
+int raw_nfs_commit_done(struct bpf_raw_tracepoint_args *ctx) {
+ u64 key = (u64)ctx->args[1];
+ struct commit_t *cp = commitinfo.lookup(&key);
+
+ if (cp) {
+ struct nfs_commit_data *cd = (struct nfs_commit_data *)key;
+ struct nfs_open_context *p;
+ struct dentry *de;
+ struct qstr qs;
+ u64 ts = bpf_ktime_get_ns();
+ u64 delta_us = (ts - cp->ts) / 1000;
+ u32 pid = bpf_get_current_pid_tgid() >> 32;
+ struct data_t data = {.type = TRACE_COMMIT, .offset = cp->offset,
+ .size = cp->count, .ts_us = ts/1000, .delta_us = delta_us,
+ .pid = pid};
+
+ commitinfo.delete(&key);
+ bpf_get_current_comm(&data.task, sizeof(data.task));
+
+ if(FILTER_PID)
+ return 0;
+
+ if (FILTER_US)
+ return 0;
+
+ bpf_probe_read_kernel(&p, sizeof(p), &cd->context);
+ bpf_probe_read_kernel(&de, sizeof(de), &p->dentry);
+ bpf_probe_read_kernel(&qs, sizeof(qs), &de->d_name);
+ if (qs.len) {
+ bpf_probe_read_kernel(&data.file, sizeof(data.file),
+ (void *)qs.name);
+ events.perf_submit(ctx, &data, sizeof(data));
+ }
+ }
+ return 0;
+}
+
"""
if min_ms == 0:
bpf_text = bpf_text.replace('FILTER_US', '0')
@@ -253,6 +316,8 @@ def print_event(cpu, data, size):
type = 'O'
elif event.type == 3:
type = 'G'
+ elif event.type == 4:
+ type = 'C'

if(csv):
print("%d,%s,%d,%s,%d,%d,%d,%s" % (
@@ -273,7 +338,18 @@ def print_event(cpu, data, size):
# Currently specifically works for NFSv4, the other kprobes are generic
# so it should work with earlier NFS versions

-b = BPF(text=bpf_text)
+# The following warning is shown on kernels after linux-5.18 when using bcc.
+# Add compile option to silence it.
+# In file included from /virtual/main.c:7:
+# In file included from include/linux/nfs_fs.h:31:
+# In file included from include/linux/sunrpc/auth.h:13:
+# In file included from include/linux/sunrpc/sched.h:19:
+# include/linux/sunrpc/xdr.h:751:10: warning: result of comparison of constant 4611686018427387903 with expression of type '__u32' (aka 'unsigned int') is always false [-Wtautological-constant-out-of-range-compare]
+# if (len > SIZE_MAX / sizeof(*p))
+# ~~~ ^ ~~~~~~~~~~~~~~~~~~~~~
+# 1 warning generated.
+b = BPF(text=bpf_text,
+ cflags=["-Wno-tautological-constant-out-of-range-compare"])
b.attach_kprobe(event="nfs_file_read", fn_name="trace_rw_entry")
b.attach_kprobe(event="nfs_file_write", fn_name="trace_rw_entry")
b.attach_kprobe(event="nfs4_file_open", fn_name="trace_file_open_entry")
@@ -286,6 +362,9 @@ b.attach_kretprobe(event="nfs4_file_open
b.attach_kretprobe(event="nfs_file_open", fn_name="trace_file_open_return")
b.attach_kretprobe(event="nfs_getattr", fn_name="trace_getattr_return")

+b.attach_raw_tracepoint("nfs_initiate_commit", "raw_nfs_initiate_commit")
+b.attach_raw_tracepoint("nfs_commit_done", "raw_nfs_commit_done")
+
if(csv):
print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE")
else:

---
Kosuke TATSUKAWA | 1st Platform Software Division
| NEC Solution Innovators
| tatsu-ab1@...

Join {iovisor-dev@lists.iovisor.org to automatically receive all group messages.