Hi, On Tue, Nov 15, 2022 at 2:04 AM TATSUKAWA KOSUKE(立川 江介) <tatsu-ab1@...> wrote:
Hi,
I was using nfsslower in bcc tools and noticed that it doesn't print out slow commit operations, which sometimes cause long delays in NFS writes. I tried adding code to trace slow commit operations and created the attached patch. The concept and code looks okay to me. One thing is you could use RAW_TRACEPOINT_PROBE macro to be consistent with some other bcc tools raw tracepoint usages.
Please submit a pull request. thank you for the reply. I'll change the code to use RAW_TRACEPOINT_PROBE macro used in other bcc tools. While testing on some old environments which doesn't support raw tracepoints, I noticed that linux/nfs_fs.h header file causes bcc to print the error message "use of undeclared identifier 'KBUILD_MODNAME'". This problem seems to affect other bcc tools, which #define this symbol to work around it. I'll submit a pull request once I've done a little more testing. Best regards. Please let me hear your opinions.
NFS commits are quite slow and are not performed often, so the patch shouldn't incur visible overhead.
Below is a sample output from the modified tool when writing a large file using dd command. Most commits are issued from kworker threads, so the commit output may be mostly dropped when filtered by PID.
TIME COMM PID T BYTES OFF_KB LAT(ms) FILENAME ... 16:46:11 dd 1212 W 1048576 651264 46.58 testfile 16:46:11 dd 1212 W 1048576 653312 54.41 testfile 16:46:11 dd 1212 W 1048576 654336 18.96 testfile 16:46:11 dd 1212 W 1048576 655360 49.05 testfile 16:46:11 dd 1212 W 1048576 657408 82.96 testfile 16:46:11 dd 1212 W 1048576 659456 109.25 testfile 16:46:12 dd 1212 W 1048576 660480 163.55 testfile 16:46:12 dd 1212 W 1048576 662528 205.44 testfile 16:46:13 dd 1212 W 1048576 663552 751.02 testfile 16:46:37 kworker/u8:5 1207 C 0 0 27449.05 testfile 16:46:37 kworker/u8:5 1207 C 0 0 26725.16 testfile 16:46:37 kworker/u8:5 1207 C 0 0 27592.04 testfile 16:46:37 kworker/u8:4 1206 C 0 0 22188.75 testfile 16:46:37 kworker/u8:4 1206 C 0 0 26092.59 testfile 16:46:37 kworker/u8:4 1206 C 0 0 27268.90 testfile 16:46:37 kworker/u8:4 1206 C 0 0 22303.24 testfile 16:46:37 kworker/u8:4 1206 C 0 0 27081.34 testfile 16:46:37 dd 1212 W 1048576 664576 24337.80 testfile 16:46:38 dd 1212 W 1048576 958464 61.77 testfile 16:46:38 dd 1212 W 1048576 960512 56.60 testfile 16:46:38 dd 1212 W 1048576 963584 55.75 testfile 16:46:38 dd 1212 W 1048576 965632 54.84 testfile ...
Best regards. --- Kosuke TATSUKAWA | 1st Platform Software Division | NEC Solution Innovators | tatsu-ab1@...
------------------------------------------------------------------------ diff -rup a/tools/nfsslower.py b/tools/nfsslower.py --- a/tools/nfsslower 2022-11-14 12:39:23.677073240 +0900 +++ b/tools/nfsslower 2022-11-14 14:44:49.960073240 +0900 @@ -9,6 +9,8 @@ # This script traces some common NFS operations: read, write, opens and # getattr. It measures the time spent in these operations, and prints details # for each that exceeded a threshold. +# The script also traces commit operations, which is specific to nfs and could +# be pretty slow. # # WARNING: This adds low-overhead instrumentation to these NFS operations, # including reads and writes from the file system cache. Such reads and writes @@ -25,6 +27,8 @@ # Currently there aren't any tracepoints available for nfs_read_file, # nfs_write_file and nfs_open_file, nfs_getattr does have entry and exit # tracepoints but we chose to use kprobes for consistency +# Raw tracepoints are used to trace nfs:nfs_initiate_commit and +# nfs:nfs_commit_done. # # 31-Aug-2017 Samuel Nair created this. Should work with NFSv{3,4}
@@ -41,8 +45,8 @@ examples = """ ./nfsslower -p 121 # trace pid 121 only """ parser = argparse.ArgumentParser( - description="""Trace READ, WRITE, OPEN \ -and GETATTR NFS calls slower than a threshold,\ + description="""Trace READ, WRITE, OPEN, GETATTR \ +and COMMIT NFS calls slower than a threshold,\ supports NFSv{3,4}""", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=examples) @@ -66,11 +70,13 @@ bpf_text = """ #include <linux/fs.h> #include <linux/sched.h> #include <linux/dcache.h> +#include <linux/nfs_fs.h>
#define TRACE_READ 0 #define TRACE_WRITE 1 #define TRACE_OPEN 2 #define TRACE_GETATTR 3 +#define TRACE_COMMIT 4
struct val_t { u64 ts; @@ -79,6 +85,12 @@ struct val_t { struct dentry *d; };
+struct commit_t { + u64 ts; + u64 offset; + u64 count; +}; + struct data_t { // XXX: switch some to u32's when supported u64 ts_us; @@ -93,6 +105,7 @@ struct data_t {
BPF_HASH(entryinfo, u64, struct val_t); BPF_PERF_OUTPUT(events); +BPF_HASH(commitinfo, u64, struct commit_t);
int trace_rw_entry(struct pt_regs *ctx, struct kiocb *iocb, struct iov_iter *data) @@ -227,6 +240,56 @@ int trace_getattr_return(struct pt_regs return trace_exit(ctx, TRACE_GETATTR); }
+int raw_nfs_initiate_commit(struct bpf_raw_tracepoint_args *ctx) { + u64 key = (u64)ctx->args[0]; + struct nfs_commit_data *cd = (struct nfs_commit_data *)key; + struct commit_t c = { 0 }; + + c.ts = bpf_ktime_get_ns(); + bpf_probe_read_kernel(&c.offset, sizeof(cd->args.offset), + &cd->args.offset); + bpf_probe_read_kernel(&c.count, sizeof(cd->args.count), &cd->args.count); + commitinfo.update(&key, &c); + return 0; +} + +int raw_nfs_commit_done(struct bpf_raw_tracepoint_args *ctx) { + u64 key = (u64)ctx->args[1]; + struct commit_t *cp = commitinfo.lookup(&key); + + if (cp) { + struct nfs_commit_data *cd = (struct nfs_commit_data *)key; + struct nfs_open_context *p; + struct dentry *de; + struct qstr qs; + u64 ts = bpf_ktime_get_ns(); + u64 delta_us = (ts - cp->ts) / 1000; + u32 pid = bpf_get_current_pid_tgid() >> 32; + struct data_t data = {.type = TRACE_COMMIT, .offset = cp->offset, + .size = cp->count, .ts_us = ts/1000, .delta_us = delta_us, + .pid = pid}; + + commitinfo.delete(&key); + bpf_get_current_comm(&data.task, sizeof(data.task)); + + if(FILTER_PID) + return 0; + + if (FILTER_US) + return 0; + + bpf_probe_read_kernel(&p, sizeof(p), &cd->context); + bpf_probe_read_kernel(&de, sizeof(de), &p->dentry); + bpf_probe_read_kernel(&qs, sizeof(qs), &de->d_name); + if (qs.len) { + bpf_probe_read_kernel(&data.file, sizeof(data.file), + (void *)qs.name); + events.perf_submit(ctx, &data, sizeof(data)); + } + } + return 0; +} + """ if min_ms == 0: bpf_text = bpf_text.replace('FILTER_US', '0') @@ -253,6 +316,8 @@ def print_event(cpu, data, size): type = 'O' elif event.type == 3: type = 'G' + elif event.type == 4: + type = 'C'
if(csv): print("%d,%s,%d,%s,%d,%d,%d,%s" % ( @@ -273,7 +338,18 @@ def print_event(cpu, data, size): # Currently specifically works for NFSv4, the other kprobes are generic # so it should work with earlier NFS versions
-b = BPF(text=bpf_text) +# The following warning is shown on kernels after linux-5.18 when using bcc. +# Add compile option to silence it. +# In file included from /virtual/main.c:7: +# In file included from include/linux/nfs_fs.h:31: +# In file included from include/linux/sunrpc/auth.h:13: +# In file included from include/linux/sunrpc/sched.h:19: +# include/linux/sunrpc/xdr.h:751:10: warning: result of comparison of constant 4611686018427387903 with expression of type '__u32' (aka 'unsigned int') is always false [-Wtautological-constant-out-of-range-compare] +# if (len > SIZE_MAX / sizeof(*p)) +# ~~~ ^ ~~~~~~~~~~~~~~~~~~~~~ +# 1 warning generated. +b = BPF(text=bpf_text, + cflags=["-Wno-tautological-constant-out-of-range-compare"]) b.attach_kprobe(event="nfs_file_read", fn_name="trace_rw_entry") b.attach_kprobe(event="nfs_file_write", fn_name="trace_rw_entry") b.attach_kprobe(event="nfs4_file_open", fn_name="trace_file_open_entry") @@ -286,6 +362,9 @@ b.attach_kretprobe(event="nfs4_file_open b.attach_kretprobe(event="nfs_file_open", fn_name="trace_file_open_return") b.attach_kretprobe(event="nfs_getattr", fn_name="trace_getattr_return")
+b.attach_raw_tracepoint("nfs_initiate_commit", "raw_nfs_initiate_commit") +b.attach_raw_tracepoint("nfs_commit_done", "raw_nfs_commit_done") + if(csv): print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE") else:
--- Kosuke TATSUKAWA | 1st Platform Software Division | NEC Solution Innovators | tatsu-ab1@...
|