While using ext4dist on RHEL7 we occasionally get
following screwed up latencies:
# ext4dist
Tracing ext4 operation latency... Hit Ctrl-C to end.
^C
operation = write
usecs : count distribution
0 -> 1 : 1134529 |******** |
2 -> 3 : 2777582 |********************|
4 -> 7 : 688014 |**** |
8 -> 15 : 36160 | |
16 -> 31 : 698 | |
32 -> 63 : 6 | |
64 -> 127 : 15 | |
128 -> 255 : 7 | |
256 -> 511 : 1 | |
512 -> 1023 : 0 | |
1024 -> 2047 : 0 | |
2048 -> 4095 : 2 | |
4096 -> 8191 : 1 | |
8192 -> 16383 : 5 | |
16384 -> 32767 : 0 | |
32768 -> 65535 : 0 | |
9007199254740992 -> 18014398509481983 : 0 | |
18014398509481984 -> 36028797018963967 : 1 | |
The reason for this is that on RHEL7 it's possible to get backward
timestamp with bpf_ktime_get_ns. This needs to be fixed, but meanwhile
this fix makes sure the latencies with backward times are skipped.
For the rest of the kernels this is just sanity fix with
possibly just single compare instruction overhead.
Signed-off-by: Jiri Olsa <jolsa@...>
---
tools/ext4dist.py | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/tools/ext4dist.py b/tools/ext4dist.py
index 227c1386c553..cb950737686d 100755
--- a/tools/ext4dist.py
+++ b/tools/ext4dist.py
@@ -110,14 +110,20 @@ static int trace_return(struct pt_regs *ctx, const char *op)
if (tsp == 0) {
return 0; // missed start or filtered
}
- u64 delta = (bpf_ktime_get_ns() - *tsp) / FACTOR;
+ s64 delta = bpf_ktime_get_ns() - *tsp;
+ start.delete(&pid);
+
+ // skip entries with backward time
+ if (delta < 0)
+ return 0;
+
+ delta /= FACTOR;
// store as histogram
dist_key_t key = {.slot = bpf_log2l(delta)};
__builtin_memcpy(&key.op, op, sizeof(key.op));
dist.increment(key);
- start.delete(&pid);
return 0;
}
--
1.8.3.1