Print slow commit operations in nfsslower


TATSUKAWA KOSUKE(立川 江介)
 

Hi,

I was using nfsslower in bcc tools and noticed that it doesn't print out
slow commit operations, which sometimes cause long delays in NFS writes.
I tried adding code to trace slow commit operations and created the
attached patch.

Please let me hear your opinions.

NFS commits are quite slow and are not performed often, so the patch
shouldn't incur visible overhead.

Below is a sample output from the modified tool when writing a large
file using dd command. Most commits are issued from kworker threads,
so the commit output may be mostly dropped when filtered by PID.

TIME COMM PID T BYTES OFF_KB LAT(ms) FILENAME
...
16:46:11 dd 1212 W 1048576 651264 46.58 testfile
16:46:11 dd 1212 W 1048576 653312 54.41 testfile
16:46:11 dd 1212 W 1048576 654336 18.96 testfile
16:46:11 dd 1212 W 1048576 655360 49.05 testfile
16:46:11 dd 1212 W 1048576 657408 82.96 testfile
16:46:11 dd 1212 W 1048576 659456 109.25 testfile
16:46:12 dd 1212 W 1048576 660480 163.55 testfile
16:46:12 dd 1212 W 1048576 662528 205.44 testfile
16:46:13 dd 1212 W 1048576 663552 751.02 testfile
16:46:37 kworker/u8:5 1207 C 0 0 27449.05 testfile
16:46:37 kworker/u8:5 1207 C 0 0 26725.16 testfile
16:46:37 kworker/u8:5 1207 C 0 0 27592.04 testfile
16:46:37 kworker/u8:4 1206 C 0 0 22188.75 testfile
16:46:37 kworker/u8:4 1206 C 0 0 26092.59 testfile
16:46:37 kworker/u8:4 1206 C 0 0 27268.90 testfile
16:46:37 kworker/u8:4 1206 C 0 0 22303.24 testfile
16:46:37 kworker/u8:4 1206 C 0 0 27081.34 testfile
16:46:37 dd 1212 W 1048576 664576 24337.80 testfile
16:46:38 dd 1212 W 1048576 958464 61.77 testfile
16:46:38 dd 1212 W 1048576 960512 56.60 testfile
16:46:38 dd 1212 W 1048576 963584 55.75 testfile
16:46:38 dd 1212 W 1048576 965632 54.84 testfile
...

Best regards.
---
Kosuke TATSUKAWA | 1st Platform Software Division
| NEC Solution Innovators
| tatsu-ab1@...

------------------------------------------------------------------------
diff -rup a/tools/nfsslower.py b/tools/nfsslower.py
--- a/tools/nfsslower 2022-11-14 12:39:23.677073240 +0900
+++ b/tools/nfsslower 2022-11-14 14:44:49.960073240 +0900
@@ -9,6 +9,8 @@
# This script traces some common NFS operations: read, write, opens and
# getattr. It measures the time spent in these operations, and prints details
# for each that exceeded a threshold.
+# The script also traces commit operations, which is specific to nfs and could
+# be pretty slow.
#
# WARNING: This adds low-overhead instrumentation to these NFS operations,
# including reads and writes from the file system cache. Such reads and writes
@@ -25,6 +27,8 @@
# Currently there aren't any tracepoints available for nfs_read_file,
# nfs_write_file and nfs_open_file, nfs_getattr does have entry and exit
# tracepoints but we chose to use kprobes for consistency
+# Raw tracepoints are used to trace nfs:nfs_initiate_commit and
+# nfs:nfs_commit_done.
#
# 31-Aug-2017 Samuel Nair created this. Should work with NFSv{3,4}

@@ -41,8 +45,8 @@ examples = """
./nfsslower -p 121 # trace pid 121 only
"""
parser = argparse.ArgumentParser(
- description="""Trace READ, WRITE, OPEN \
-and GETATTR NFS calls slower than a threshold,\
+ description="""Trace READ, WRITE, OPEN, GETATTR \
+and COMMIT NFS calls slower than a threshold,\
supports NFSv{3,4}""",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
@@ -66,11 +70,13 @@ bpf_text = """
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/dcache.h>
+#include <linux/nfs_fs.h>

#define TRACE_READ 0
#define TRACE_WRITE 1
#define TRACE_OPEN 2
#define TRACE_GETATTR 3
+#define TRACE_COMMIT 4

struct val_t {
u64 ts;
@@ -79,6 +85,12 @@ struct val_t {
struct dentry *d;
};

+struct commit_t {
+ u64 ts;
+ u64 offset;
+ u64 count;
+};
+
struct data_t {
// XXX: switch some to u32's when supported
u64 ts_us;
@@ -93,6 +105,7 @@ struct data_t {

BPF_HASH(entryinfo, u64, struct val_t);
BPF_PERF_OUTPUT(events);
+BPF_HASH(commitinfo, u64, struct commit_t);

int trace_rw_entry(struct pt_regs *ctx, struct kiocb *iocb,
struct iov_iter *data)
@@ -227,6 +240,56 @@ int trace_getattr_return(struct pt_regs
return trace_exit(ctx, TRACE_GETATTR);
}

+int raw_nfs_initiate_commit(struct bpf_raw_tracepoint_args *ctx) {
+ u64 key = (u64)ctx->args[0];
+ struct nfs_commit_data *cd = (struct nfs_commit_data *)key;
+ struct commit_t c = { 0 };
+
+ c.ts = bpf_ktime_get_ns();
+ bpf_probe_read_kernel(&c.offset, sizeof(cd->args.offset),
+ &cd->args.offset);
+ bpf_probe_read_kernel(&c.count, sizeof(cd->args.count), &cd->args.count);
+ commitinfo.update(&key, &c);
+ return 0;
+}
+
+int raw_nfs_commit_done(struct bpf_raw_tracepoint_args *ctx) {
+ u64 key = (u64)ctx->args[1];
+ struct commit_t *cp = commitinfo.lookup(&key);
+
+ if (cp) {
+ struct nfs_commit_data *cd = (struct nfs_commit_data *)key;
+ struct nfs_open_context *p;
+ struct dentry *de;
+ struct qstr qs;
+ u64 ts = bpf_ktime_get_ns();
+ u64 delta_us = (ts - cp->ts) / 1000;
+ u32 pid = bpf_get_current_pid_tgid() >> 32;
+ struct data_t data = {.type = TRACE_COMMIT, .offset = cp->offset,
+ .size = cp->count, .ts_us = ts/1000, .delta_us = delta_us,
+ .pid = pid};
+
+ commitinfo.delete(&key);
+ bpf_get_current_comm(&data.task, sizeof(data.task));
+
+ if(FILTER_PID)
+ return 0;
+
+ if (FILTER_US)
+ return 0;
+
+ bpf_probe_read_kernel(&p, sizeof(p), &cd->context);
+ bpf_probe_read_kernel(&de, sizeof(de), &p->dentry);
+ bpf_probe_read_kernel(&qs, sizeof(qs), &de->d_name);
+ if (qs.len) {
+ bpf_probe_read_kernel(&data.file, sizeof(data.file),
+ (void *)qs.name);
+ events.perf_submit(ctx, &data, sizeof(data));
+ }
+ }
+ return 0;
+}
+
"""
if min_ms == 0:
bpf_text = bpf_text.replace('FILTER_US', '0')
@@ -253,6 +316,8 @@ def print_event(cpu, data, size):
type = 'O'
elif event.type == 3:
type = 'G'
+ elif event.type == 4:
+ type = 'C'

if(csv):
print("%d,%s,%d,%s,%d,%d,%d,%s" % (
@@ -273,7 +338,18 @@ def print_event(cpu, data, size):
# Currently specifically works for NFSv4, the other kprobes are generic
# so it should work with earlier NFS versions

-b = BPF(text=bpf_text)
+# The following warning is shown on kernels after linux-5.18 when using bcc.
+# Add compile option to silence it.
+# In file included from /virtual/main.c:7:
+# In file included from include/linux/nfs_fs.h:31:
+# In file included from include/linux/sunrpc/auth.h:13:
+# In file included from include/linux/sunrpc/sched.h:19:
+# include/linux/sunrpc/xdr.h:751:10: warning: result of comparison of constant 4611686018427387903 with expression of type '__u32' (aka 'unsigned int') is always false [-Wtautological-constant-out-of-range-compare]
+# if (len > SIZE_MAX / sizeof(*p))
+# ~~~ ^ ~~~~~~~~~~~~~~~~~~~~~
+# 1 warning generated.
+b = BPF(text=bpf_text,
+ cflags=["-Wno-tautological-constant-out-of-range-compare"])
b.attach_kprobe(event="nfs_file_read", fn_name="trace_rw_entry")
b.attach_kprobe(event="nfs_file_write", fn_name="trace_rw_entry")
b.attach_kprobe(event="nfs4_file_open", fn_name="trace_file_open_entry")
@@ -286,6 +362,9 @@ b.attach_kretprobe(event="nfs4_file_open
b.attach_kretprobe(event="nfs_file_open", fn_name="trace_file_open_return")
b.attach_kretprobe(event="nfs_getattr", fn_name="trace_getattr_return")

+b.attach_raw_tracepoint("nfs_initiate_commit", "raw_nfs_initiate_commit")
+b.attach_raw_tracepoint("nfs_commit_done", "raw_nfs_commit_done")
+
if(csv):
print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE")
else:

Join iovisor-dev@lists.iovisor.org to automatically receive all group messages.