#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

#
# This file is part of Lustre, http://www.lustre.org/
#

# Write the usage text to stdout and terminate the script.
#
# The defaults advertised for -D, -m and -n are read from the current values
# of STAT_DELAY, MODE_LIST and STAT_COUNT. This function never returns: the
# script exits with the status of the cat that printed the text.
print_help() {
	local prog=${0##*/}

	cat <<USAGE_TEXT
Usage:
${prog} -f "nid1[ nid2...]" -t "nidA[ nidB...]" [options]
or
${prog} -H -f "host1[ host2...]" -t "hostA[ hostB...]" [options]

Options:
	-c concurrency
	   The number of requests that are active at one time. Default is 64.
	-d
	   Debug mode. Outputs the generated lst.sh commands, but does not
	   execute them.
	-e
	   Lists the number of failed RPCs on test nodes in the current session.
	-D delay
	   The interval of the statistics (in seconds). Default is $STAT_DELAY.
	-h
	   Display this help.
	-H
	   Run in "host mode". Host mode indicates that the arguments to '-t'
	   and '-f' flags are hostnames rather than LNet nids.
	-f "nid1[ nid2...]"
	   Space-separated list of LNet NIDs to place in the "clients" group.
	   When '-H' flag is specified, the '-f' argument is a space-separated
	   list of hostnames.
	-g servers|clients
	   Report stats from the specified group. Either 'clients' or
	   'servers'. Default is 'servers'.
	-m read|write|rw|ping<,read|write|rw|ping<,...>>
	   Execute the specified list of tests. Default is ${MODE_LIST// /, }.
	-M group_size
	   Subdivide the client group (-f) into multiple groups of the
	   specified size. Every client group is tested against every server
	   group (see -t and -N).
	-n count
	   The number of stat RPCs to issue. Default is $STAT_COUNT.
	-N group_size
	   Subdivide the server group (-t) into multiple groups of the
	   specified size. Every server group is tested against every client
	   group (see -f and -M).
	-O output_dir
	   Create output files in specified directory.
	   Default is PWD/lst_survey.<timestamp>
	-t "nid1[ nid2...]"
	   Space-separated list of LNet NIDs to place in the "servers" group.
	   When '-H' flag is specified, the '-t' argument is a space-separated
	   list of hostnames.
	-s bulksize1<,bulksize2<,...>>
	   For each read, write, or combined read-write test, execute the test
	   with the specified bulk sizes. Default is 4k and 1m.
	-S separator
	   Use the specified character to separate fields in the .csv output
	   file. Default is ','.
	-v
	   Prints additional output. e.g. LST parameters, group construction,
	   etc.
USAGE_TEXT
	exit
}

# Echo the arguments only when VERBOSE holds a command that succeeds
# (the script stores "true" or "false" in it). When it fails, nothing is
# printed and that command's non-zero status is returned.
verbose() {
	${VERBOSE} || return
	echo "$@"
}

# Default settings. parse_options() overrides them from the command line.
SERVERS=""
CLIENTS=""
CONCURRENCY=64
HOST_MODE=false
LST_DEBUG=false
MODE_LIST="read write ping"
C_GRP_SIZE=""
S_GRP_SIZE=""
SEP=','
SIZE_LIST="4k 1m"
SHOW_ERRORS=false
STAT_COUNT=3
STAT_DELAY=3
STAT_GROUP="servers"
TS=$(date +%s)
TEST_DIR=$PWD/lst_survey.${TS}
VERBOSE=false

# parse_options ARG...
#
# Parse the command-line arguments and store the results in the global
# settings above. Exits with status 1 on an unknown option or on an option
# that is missing its argument; -h prints the help text and exits.
#
# '-e' is a plain flag, so it must not be followed by ':' in the optstring;
# otherwise it would consume the next command-line word as its argument.
# The leading ':' selects getopts' silent error reporting, in which the
# offending option letter is delivered in OPTARG.
parse_options() {
	local OPTIND=1 OPTARG flag

	while getopts ":c:dD:eHhf:g:m:M:n:N:O:t:s:S:v" flag ; do
		case $flag in
		c) CONCURRENCY="$OPTARG";;
		d) LST_DEBUG=true;;
		D) STAT_DELAY="$OPTARG";;
		e) SHOW_ERRORS=true;;
		H) HOST_MODE=true;;
		h) print_help;;
		f) CLIENTS="$OPTARG";;
		g) STAT_GROUP="$OPTARG";;
		m) MODE_LIST="$OPTARG";;
		M) C_GRP_SIZE="$OPTARG";;
		n) STAT_COUNT="$OPTARG";;
		N) S_GRP_SIZE="$OPTARG";;
		O) TEST_DIR="$OPTARG";;
		t) SERVERS="$OPTARG";;
		s) SIZE_LIST="$OPTARG";;
		S) SEP="${OPTARG}";;
		v) VERBOSE=true;;
		:) echo "Option '-$OPTARG' requires an argument"
		   exit 1;;
		*) echo "Unrecognized option '-$OPTARG'"
		   exit 1;;
		esac
	done
}

parse_options "$@"

# Locate the lst.sh helper script. Order of preference: the LSTSH environment
# variable, a lst.sh next to this script, then the first lst.sh found in PATH.
LSTSH=${LSTSH:-$(dirname "$0")/lst.sh}
if ! [[ -f $LSTSH ]]; then
	# Remember what was tried so the error message can name it; the PATH
	# lookup below leaves LSTSH empty when nothing is found.
	lstsh_tried=$LSTSH
	# 'command -v' is a shell builtin, so no external 'which' is required.
	LSTSH=$(command -v lst.sh 2>/dev/null)
	if ! [[ -f $LSTSH ]]; then
		echo "Cannot find lst.sh script at $lstsh_tried or in PATH"
		exit 1
	fi
fi

# Both node lists are mandatory.
if [[ -z $CLIENTS ]]; then
	echo "Must specify \"clients\" group (-f)"
	exit 1
elif [[ -z $SERVERS ]]; then
	echo "Must specify \"servers\" group (-t)"
	exit 1
fi

# Turn the node lists (space- or comma-separated) into arrays. When no group
# size was given, the whole list forms a single group.
IFS=" " read -r -a CLIENTS <<< "${CLIENTS//,/ }"
if [[ -z $C_GRP_SIZE ]]; then
	C_GRP_SIZE=${#CLIENTS[@]}
fi

IFS=" " read -r -a SERVERS <<< "${SERVERS//,/ }"
if [[ -z $S_GRP_SIZE ]]; then
	S_GRP_SIZE=${#SERVERS[@]}
fi

# Sanity-check the remaining settings; the first violation found is reported
# and ends the script with status 1.
if [[ $STAT_COUNT -lt 1 ]]; then
	echo "Stat count must be > 0 (-n count)"
	exit 1
elif [[ $C_GRP_SIZE -lt 1 ]]; then
	echo "Client group size must be > 0 (-M group_size)"
	exit 1
elif [[ $C_GRP_SIZE -gt ${#CLIENTS[@]} ]]; then
	echo "Specified client group size (-M $C_GRP_SIZE) cannot be larger than number of clients specified with -f (${#CLIENTS[@]})"
	exit 1
elif [[ $S_GRP_SIZE -lt 1 ]]; then
	echo "Server group size must be > 0 (-N group_size)"
	exit 1
elif [[ $S_GRP_SIZE -gt ${#SERVERS[@]} ]]; then
	# The server group size is set with -N (-M is the client group size).
	echo "Specified server group size (-N $S_GRP_SIZE) cannot be larger than number of servers specified with -t (${#SERVERS[@]})"
	exit 1
elif ! [[ $STAT_GROUP =~ ^(servers|clients)$ ]]; then
	echo "Invalid stat group $STAT_GROUP (-g servers|clients)"
	exit 1
elif [[ -z $MODE_LIST ]]; then
	echo "Empty mode list (-m read|write|rw|ping)"
	exit 1
elif [[ -z $SIZE_LIST ]]; then
	echo "Empty bulk size list (-s 1024|4k|1m)"
	exit 1
fi

# Check that every entry of MODE_LIST is exactly one of the supported test
# modes. Exits with status 1 on the first invalid entry.
#
# The list may be separated by spaces or commas, so commas are turned into
# spaces before splitting. The regex is anchored at both ends so that
# strings merely containing a valid mode (e.g. "reading") are rejected.
validate_mode_list() {
	local m

	for m in ${MODE_LIST//,/ }; do
		if ! [[ $m =~ ^(read|write|rw|ping)$ ]]; then
			echo "Invalid mode \"$m\" specified (-m read|write|rw|ping)"
			exit 1
		fi
	done
}

validate_mode_list

# Create the results directory. The status is captured immediately after
# mkdir fails: inside an 'if ! mkdir ...' branch, $? would hold the status of
# the negated test (always 0) rather than mkdir's own exit code.
mkdir -p "${TEST_DIR}" || {
	rc=$?
	echo "Failed to create results directory at \"${TEST_DIR}\" rc=$rc"
	exit 1
}
OUTFILE=${TEST_DIR}/results.${TS}.csv

# Options passed to every lst.sh invocation. The embedded quotes around
# "bw rate" survive because do_lst() runs the command line through eval.
# NOTE(review): '-e' appears twice in the resulting string; it is kept as is
# because lst.sh's option handling is not visible here - confirm and dedupe.
LST_OPTIONS="-c $CONCURRENCY -n $STAT_COUNT -D $STAT_DELAY -e -S \"bw rate\""
LST_OPTIONS+=" -g ${STAT_GROUP} -e"
if ${HOST_MODE}; then
	LST_OPTIONS+=" -H"
fi

# print_results MODE SIZE
#
# Report the averages currently held in RD_BW_AVG, RD_RATE_AVG, W_BW_AVG and
# W_RATE_AVG for one test: append the remaining CSV fields (including the
# error counters) to OUTFILE and print a formatted row to stdout. Every test
# except "ping" is labelled MODE_SIZE. Does nothing in debug mode.
print_results() {
	local label="$1"
	local bulk="$2"

	${LST_DEBUG} && return

	if [[ $label != ping ]]; then
		label="${label}_${bulk}"
	fi

	# The row is started by the caller, hence the leading separator.
	printf '%s%s%s%s%s%s%s\n' \
		"${SEP}${label}" \
		"${SEP}${RD_BW_AVG}" "${SEP}${RD_RATE_AVG}" \
		"${SEP}${W_BW_AVG}" "${SEP}${W_RATE_AVG}" \
		"${SEP}${SERVER_ERRORS}" "${SEP}${CLIENT_ERRORS}" \
		>>"${OUTFILE}"

	printf "%14s  %14s  %15s  %14s  %15s\n" \
		"${label}" \
		"${RD_BW_AVG}" "${RD_RATE_AVG}" \
		"${W_BW_AVG}" "${W_RATE_AVG}"
}

# Results of the most recent do_lst() call (averages) and the error counts
# accumulated across calls (see do_lst() and run_test()).
SERVER_ERRORS=0
CLIENT_ERRORS=0
RD_RATE_AVG=0
W_RATE_AVG=0
RD_BW_AVG=0
W_BW_AVG=0

# do_lst MODE LST_ARGS...
#
# Run lst.sh with LST_ARGS and digest its output.
#
# Arguments:
#   MODE      test mode; accepted for the callers' sake but not used here
#   LST_ARGS  command line for $LSTSH; it is passed through eval so that
#             quoted groups embedded in the string are honoured
# Globals written:
#   RD_RATE_AVG, W_RATE_AVG, RD_BW_AVG, W_BW_AVG  averages over STAT_COUNT
#       samples of this run
#   SERVER_ERRORS, CLIENT_ERRORS  incremented by this run's error counts
# Output:
#   Raw lst.sh output is appended to ${TEST_DIR}/lst.${TS}.out. In debug
#   mode the command line is printed and nothing is executed.
# Exits with status 1 when the number of parsed samples is not the expected
# one.
do_lst() {
	shift	# drop MODE
	local lst_args="$*"

	RD_RATE_AVG=0
	W_RATE_AVG=0
	RD_BW_AVG=0
	W_BW_AVG=0

	local -a vals

	if ${LST_DEBUG}; then
		echo "$LSTSH ${lst_args}"
		return
	fi

	# Keep the third field of every "[R]"/"[W]" line and the second field
	# of every "error nodes in" line, flattened onto one line by xargs.
	IFS=" " read -r -a vals <<< "$(eval "$LSTSH" "${lst_args}" 2>&1 |
				       tee -a "${TEST_DIR}"/lst."${TS}".out |
				       awk '/^\[(R|W)\]/{print $3};
				            /error nodes in/{print $2}' |
				       xargs echo)"

	# Each stat RPC generates 4 lines of output, and we have two lines for
	# the error counts
	local expect=$((2 + STAT_COUNT * 4))

	if [[ ${#vals[@]} -ne $expect ]]; then
		echo
		echo "Error: Failed to get all samples. Expect $expect, found ${#vals[@]}"
		# Explicit non-zero status: a bare 'exit' would reuse the
		# status of the echo above and report success.
		exit 1
	fi

	# Samples arrive in groups of four: read rate, write rate, read
	# bandwidth, write bandwidth. Build a "0 + a + b ..." expression for
	# each series; the last two values are the error counts.
	local i
	for ((i = 0; i < expect - 2; i += 4)); do
		RD_RATE_AVG+=" + ${vals[i]}"
		W_RATE_AVG+=" + ${vals[i + 1]}"
		RD_BW_AVG+=" + ${vals[i + 2]}"
		W_BW_AVG+=" + ${vals[i + 3]}"
	done

	# NOTE: bc runs with its default scale of 0 here, so each average is
	# truncated to an integer.
	RD_RATE_AVG=$(echo "($RD_RATE_AVG)/$STAT_COUNT" | bc)
	W_RATE_AVG=$(echo "($W_RATE_AVG)/$STAT_COUNT" | bc)
	RD_BW_AVG=$(echo "($RD_BW_AVG)/$STAT_COUNT" | bc)
	W_BW_AVG=$(echo "($W_BW_AVG)/$STAT_COUNT" | bc)

	SERVER_ERRORS=$((SERVER_ERRORS + ${vals[expect - 2]}))
	CLIENT_ERRORS=$((CLIENT_ERRORS + ${vals[expect - 1]}))
}

# run_test SERVER_GROUP CLIENT_GROUP
#
# Run every requested test mode between one server group and one client
# group. Read/write style modes are run once per bulk size in SIZE_LIST;
# ping is run once. For each test the start of a CSV row (the two groups) is
# appended to OUTFILE, then do_lst() executes the test and print_results()
# reports it. In debug mode no banner, table header or summary is printed.
run_test() {
	local srv_grp="$1"
	local cli_grp="$2"

	${LST_DEBUG} || {
		echo
		echo "Commence lst-survey - $(date)"
		echo "Server Group: ${srv_grp}"
		echo "Client Group: ${cli_grp}"
		echo
		printf "%14s  %14s  %15s  %14s  %15s\n" \
			"Mode" "Read MB/s" "Read RPC/s" "Write MB/S" "Write RPC/s"
	}

	# Error counters are per group pairing; do_lst() adds to them.
	SERVER_ERRORS=0
	CLIENT_ERRORS=0

	local base_args="-t \"${srv_grp}\" -f \"${cli_grp}\""
	base_args+=" -d ${C_GRP_SIZE}:${S_GRP_SIZE} $LST_OPTIONS"

	local test_mode test_size
	for test_mode in ${MODE_LIST//,/ }; do
		# "ping" is appended as a pseudo size so that the ping mode,
		# which takes no bulk size, is visited exactly once.
		for test_size in ${SIZE_LIST//,/ } ping; do
			if [[ $test_size == ping ]]; then
				[[ $test_mode == ping ]] || continue
			elif [[ $test_mode == ping ]]; then
				continue
			fi

			printf '%s%s' "${srv_grp}" "${SEP}${cli_grp}" \
				>>"${OUTFILE}"
			do_lst "$test_mode" \
				"${base_args} -m $test_mode -s $test_size"
			print_results "$test_mode" "$test_size"
		done
	done

	if ${SHOW_ERRORS} && ! ${LST_DEBUG}; then
		echo "Server Errors: ${SERVER_ERRORS}"
		echo "Client Errors: ${CLIENT_ERRORS}"
	fi

	${LST_DEBUG} || {
		echo
		echo "Finished lst-survey - $(date)"
	}
}

# Append the CSV header row (one name per column, joined with SEP) to the
# results file.
printf '%s\n' \
	"Servers${SEP}Clients${SEP}Mode${SEP}Read_BW${SEP}Read_Rate${SEP}Write_BW${SEP}Write_Rate${SEP}Server_Errors${SEP}Client_Errors" \
	>>"${OUTFILE}"

# split_into_groups SIZE MEMBER...
#
# Partition MEMBER... into consecutive groups of exactly SIZE members and
# print one group per line, members separated by single spaces. Members left
# over after the last complete group are not printed.
split_into_groups() {
	local size="$1"
	shift

	local member group="" count=0
	for member in "$@"; do
		group="${group:+$group }${member}"
		if ((++count == size)); then
			printf '%s\n' "$group"
			group=""
			count=0
		fi
	done
}

# Build the server groups. Only complete groups are tested, so say so when
# some servers are left over instead of dropping them silently.
declare -a s_groups=()
n_s_groups=$((${#SERVERS[@]} / S_GRP_SIZE))
verbose "Creating $n_s_groups server group(s) of size $S_GRP_SIZE"
if ((${#SERVERS[@]} % S_GRP_SIZE != 0)); then
	echo "Warning: last $((${#SERVERS[@]} % S_GRP_SIZE)) server(s) do not fill a group of size $S_GRP_SIZE and will not be tested" >&2
fi
while IFS= read -r grp_str; do
	s_groups+=("$grp_str")
	verbose "Server group ${#s_groups[@]}: $grp_str"
done < <(split_into_groups "$S_GRP_SIZE" "${SERVERS[@]}")

# Build the client groups the same way.
declare -a c_groups=()
n_c_groups=$((${#CLIENTS[@]} / C_GRP_SIZE))
verbose "Creating $n_c_groups client group(s) of size $C_GRP_SIZE"
if ((${#CLIENTS[@]} % C_GRP_SIZE != 0)); then
	echo "Warning: last $((${#CLIENTS[@]} % C_GRP_SIZE)) client(s) do not fill a group of size $C_GRP_SIZE and will not be tested" >&2
fi
while IFS= read -r grp_str; do
	c_groups+=("$grp_str")
	verbose "Client group ${#c_groups[@]}: $grp_str"
done < <(split_into_groups "$C_GRP_SIZE" "${CLIENTS[@]}")

verbose "Arguments to $LSTSH: $LST_OPTIONS"

# Tell the user where the results of this run are stored.
echo "CSV results: ${OUTFILE}"
echo "LST output: ${TEST_DIR}/lst.${TS}.out"

# Run the survey once for every (server group, client group) pairing.
s_grp_idx=0
while ((s_grp_idx < n_s_groups)); do
	c_grp_idx=0
	while ((c_grp_idx < n_c_groups)); do
		run_test "${s_groups[s_grp_idx]}" "${c_groups[c_grp_idx]}"
		c_grp_idx=$((c_grp_idx + 1))
	done
	s_grp_idx=$((s_grp_idx + 1))
done
