From 6a58902bf7bfa82f65b21f209453253c3e1c6350 Mon Sep 17 00:00:00 2001 From: Karthik Subramanian Date: Sat, 13 May 2023 13:17:12 +0530 Subject: [PATCH 1/2] Included mountstats and nfsstats in logs 1. Included mountstats & nfsstats in the logs that we collect. 2. Cleaned up code and fixed two misc bugs (one in arg parsing, one in killing tcpdump). 3. Fixed Python invocation to be independent of the actual location of the Python interpreter in trace-nfsbpf. --- NfsDiagnostics/nfsclientlogs.sh | 132 ++++++++++++++++++++------------ NfsDiagnostics/trace-nfsbpf | 2 +- 2 files changed, 86 insertions(+), 48 deletions(-) diff --git a/NfsDiagnostics/nfsclientlogs.sh b/NfsDiagnostics/nfsclientlogs.sh index 7d9fbc2b..170f61d3 100644 --- a/NfsDiagnostics/nfsclientlogs.sh +++ b/NfsDiagnostics/nfsclientlogs.sh @@ -1,24 +1,24 @@ #!/bin/bash -#pid file -pidfile="/tmp/nfsclientlog.pid" +# The first line of this file contains the pid of the trace-cmd instance that we +# spawn. If there is a second line, it will contain the pid of the tcpdump +# instance that we spawn. +BG_JOBS_PIDFILE="/tmp/nfsclientlog.pid" -# prog file -progfile="nohup trace-cmd record -e nfs" +invoke_trace_cmd="nohup trace-cmd record -e nfs" # tcpdump -if [ $1 == "v3b" ] +invoke_tcpdump="nohup tcpdump -p -s 0 -w nfs_traffic.pcap port 2049" +if [ "$1" == "v3b" ] then - progfile1="nohup tcpdump -p -s 0 -w nfs_traffic.pcap port 111" -else - progfile1="nohup tcpdump -p -s 0 -w nfs_traffic.pcap port 2049" + invoke_tcpdump="nohup tcpdump -p -s 0 -w nfs_traffic.pcap port 111" fi # trace-nfsbpf -ABSOLUTE_PATH="$(cd "$(dirname "trace-nfsbpf")" && pwd)/$(basename "trace-nfsbpf")" -progfile2="nohup $ABSOLUTE_PATH" +NFSBPF_ABSOLUTE_PATH="$(cd "$(dirname "trace-nfsbpf")" && pwd)/$(basename "trace-nfsbpf")" +invoke_trace_nfsbpf="nohup $NFSBPF_ABSOLUTE_PATH" -# Sanity to check whether trace-cmd is installed. +# Check if trace-cmd is installed. which trace-cmd > /dev/null if [ $? == 1 ]; then echo "trace-cmd is not installed." 
@@ -26,24 +26,39 @@ if [ $? == 1 ]; then fi start() { - dmesg -Tc > /dev/null + local saved_pid; + local trace_cmd_pid; + local retry; + rm -f nfs_traffic.pcap + + dmesg -Tc > /dev/null + + date >nfsstat.pre.out + echo >>nfsstat.pre.out + nfsstat -o nfs -l |sort -k5 -nr >>nfsstat.pre.out + + date >mountstats.pre.out + echo >>mountstats.pre.out + mountstats >>mountstats.pre.out + rpcdebug -m rpc -s all rpcdebug -m nfs -s all + retry=0 - if [ -f "$pidfile" ]; then - read pid < $pidfile; - pgrep_pid=`pgrep trace-cmd | head -1` - if [ "$pid" == "$pgrep_pid" ] + if [ -f "$BG_JOBS_PIDFILE" ]; then + read saved_pid < $BG_JOBS_PIDFILE; + trace_cmd_pid=`pgrep trace-cmd | head -1` + if [ "$saved_pid" == "$trace_cmd_pid" ] then echo "[error] [`date +'%FT%H:%M:%S%z'`] trace-cmd is already running, restarting trace-cmd." - kill -INT $pid - ps -p "$pid" > /dev/null + kill -INT $saved_pid + ps -p "$saved_pid" > /dev/null while [ $? == 0 ] && [ $retry -lt 10 ] do retry=`expr $retry + 1` sleep 1 - ps -p "$pid" > /dev/null + ps -p "$saved_pid" > /dev/null done if [ $retry -eq 10 ]; then echo "[error] [`date +'%FT%H:%M:%S%z'`] Restarting trace-cmd failed. Exiting.." @@ -53,71 +68,94 @@ start() { fi fi - $progfile 0<&- > /dev/null 2>&1 & - - # save the pid to a file - echo $! > $pidfile + $invoke_trace_cmd 0<&- > /dev/null 2>&1 & + # save the trace_cmd pid + echo $! > $BG_JOBS_PIDFILE if [ "$1" == "CaptureNetwork" ] || [ "$2" == "CaptureNetwork" ]; then - $progfile1 0<&- > /dev/null 2>&1 & - echo $! >> $pidfile + # If we're spawning tcpdump, save its pid too. + $invoke_tcpdump 0<&- > /dev/null 2>&1 & + echo $! 
>> $BG_JOBS_PIDFILE fi if [ "$1" == "OnAnomaly" ] || [ "$2" == "OnAnomaly" ]; then - $progfile2 0<&- > /dev/null 2>&1 & - fi + $invoke_trace_nfsbpf 0<&- > /dev/null 2>&1 & + fi } stop() { - retry=0 + local trace_cmd_saved_pid; + local tcpdump_saved_pid; + local retry; + rm -rf nfs_trace - if [ -f "$pidfile" ]; then + + retry=0 + if [ -f "$BG_JOBS_PIDFILE" ]; then while read -r line do - read -r tcpdump_pid - pid=$line - done < $pidfile; - pgrep_pid=`pgrep trace-cmd | head -1` - if [ "$pid" != "" ] && [ "$pid" == "$pgrep_pid" ] + read -r tcpdump_saved_pid + trace_cmd_saved_pid=$line + done < $BG_JOBS_PIDFILE; + trace_cmd_pid=`pgrep trace-cmd | head -1` + if [ "$trace_cmd_saved_pid" != "" ] && [ "$trace_cmd_saved_pid" == "$trace_cmd_pid" ] then - kill -INT $pid - ps -p "$pid" > /dev/null + kill -INT $trace_cmd_saved_pid + ps -p "$trace_cmd_saved_pid" > /dev/null while [ $? == 0 ] && [ $retry -lt 10 ] do retry=`expr $retry + 1` sleep 1 - ps -p "$pid" > /dev/null + ps -p "$trace_cmd_saved_pid" > /dev/null done trace-cmd report > nfs_trace if [ $? != 0 ]; then - rm -f $pidfile + rm -f $BG_JOBS_PIDFILE return 1 fi - rm -f $pidfile + rm -f $BG_JOBS_PIDFILE else - rm -f $pidfile + rm -f $BG_JOBS_PIDFILE return 1 fi - pgrep_tcpdump_pid=`pgrep tcpdump | head -1` - if [ "$tcpdump_pid" == "$pgrep_tcpdump_pid" ] && [ "$tcpdump_pid" != "" ] + tcpdump_pid=`pgrep tcpdump | head -1` + if [ "$tcpdump_saved_pid" == "$tcpdump_pid" ] && [ "$tcpdump_saved_pid" != "" ] then - sudo kill -INT $tcpdump_pid - ps -p "$tcpdump_pid" > /dev/null + sudo kill -INT $tcpdump_saved_pid + ps -p "$tcpdump_saved_pid" > /dev/null while [ $? 
== 0 ] && [ $retry -lt 10 ] do retry=`expr $retry + 1` sleep 1 - ps -p "$pid" > /dev/null + ps -p "$tcpdump_saved_pid" > /dev/null done fi else - rm -f $pidfile + rm -f $BG_JOBS_PIDFILE return 1 fi + rpcdebug -m rpc -c all rpcdebug -m nfs -c all + dmesg -T > nfs_dmesg - zip nfs_debug.zip nfs_dmesg nfs_trace nfs_traffic.pcap + date >nfsstat.post.out + echo >>nfsstat.post.out + nfsstat -o nfs -l |sort -k5 -nr >>nfsstat.post.out + + date >mountstats.post.out + echo >>mountstats.post.out + mountstats >>mountstats.post.out + + zip nfs_debug.zip \ + nfs_dmesg \ + nfs_trace \ + nfs_traffic.pcap \ + nfsstat.pre.out \ + nfsstat.post.out \ + mountstats.pre.out \ + mountstats.post.out + return 0; } diff --git a/NfsDiagnostics/trace-nfsbpf b/NfsDiagnostics/trace-nfsbpf index e6f83c03..7d72b00a 100644 --- a/NfsDiagnostics/trace-nfsbpf +++ b/NfsDiagnostics/trace-nfsbpf @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python from bcc import BPF from bcc.utils import printb from time import sleep From ada27ac08006f676296314372efd1ccb3cfa455a Mon Sep 17 00:00:00 2001 From: Karthik Subramanian Date: Tue, 30 Jan 2024 10:54:10 +0530 Subject: [PATCH 2/2] Review fix: Use realpath for trace-nfsbpf --- NfsDiagnostics/nfsclientlogs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NfsDiagnostics/nfsclientlogs.sh b/NfsDiagnostics/nfsclientlogs.sh index 170f61d3..0d89105a 100644 --- a/NfsDiagnostics/nfsclientlogs.sh +++ b/NfsDiagnostics/nfsclientlogs.sh @@ -15,7 +15,7 @@ then fi # trace-nfsbpf -NFSBPF_ABSOLUTE_PATH="$(cd "$(dirname "trace-nfsbpf")" && pwd)/$(basename "trace-nfsbpf")" +NFSBPF_ABSOLUTE_PATH="$(realpath "trace-nfsbpf")" invoke_trace_nfsbpf="nohup $NFSBPF_ABSOLUTE_PATH" # Check if trace-cmd is installed.