#!/bin/bash

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for license information.
# --------------------------------------------------------------------------------------------

#
# Watchdog for monitoring unmounts and stunnel processes
#

# procfs file scanned by is_nfs_server_active_for_target().
NFSFS_SERVERS_FILE='/proc/fs/nfsfs/servers'

# Interval, in seconds, between active reachability probes of the storage IP.
HEALTH_CHECK_FREQUENCY=60

# Modification times (epoch seconds) of MOUNTMAPv4 and MOUNTMAPv4NOTLS as last
# recorded by the watchdog; both start at 0 until a processing pass sets them.
mtime_mountmap=0
mtime_mountmap_notls=0

#
# Check conntrack entries for NFSv4 non-TLS DNAT mounts (port 2049).
#
# Arguments:
#   $1 - destination IP the conntrack entries are filtered on (-d).
#   $2 - storage IP; used as the reply-source filter (-r) for SYN_SENT entries.
#
# Each matching SYN_SENT entry is passed to reconcile_conntrack_synsent and
# each remaining [UNREPLIED] entry to reconcile_conntrack_unreplied.
#
# Returns 0 if stuck entries were found and handed over for cleanup (failure
# signal), 1 otherwise.
#
reconcile_conntrack_v4()
{
    local l_ip=$1
    local l_nfsip=$2
    local found_stuck=1
    local output2049 entry matchstr
    local l_seconds_remaining l_sport l_dport l_reply_srcip
    local -a entries

    #
    # The reconcile_* helpers have always run with IFS set to newline because
    # this function used to assign IFS globally. Keep that environment for
    # them, but scope it with 'local' so the caller's IFS is restored on return.
    #
    local IFS=$'\n'

    # Make the dots of the IP literal when it is embedded in a regex.
    local l_ip_re=${l_ip//./[.]}

    output2049=$(conntrack -L -p tcp -d "$l_ip" -r "$l_nfsip" --dport 2049 --state SYN_SENT 2>/dev/null)
    if [ -n "$output2049" ]; then
        # mapfile (not an unquoted 'for') so that "[UNREPLIED]" in an entry is
        # never subject to pathname expansion.
        mapfile -t entries <<< "$output2049"
        matchstr="tcp\s+[0-9]+\s+([0-9]+)\s+SYN_SENT\s+src=[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+\s+dst=${l_ip_re}\s+sport=([0-9]+)\s+dport=([0-9]+).*"
        for entry in "${entries[@]}"; do
            [ -n "$entry" ] || continue
            vecho "$entry"
            if [[ "$entry" =~ $matchstr ]]; then
                # Copy the captures before calling out; BASH_REMATCH is global.
                l_seconds_remaining=${BASH_REMATCH[1]}
                l_sport=${BASH_REMATCH[2]}
                l_dport=${BASH_REMATCH[3]}
                reconcile_conntrack_synsent "$l_ip" "$l_sport" "$l_dport" "$l_nfsip" "$l_seconds_remaining"
                found_stuck=0
            fi
        done
    fi

    output2049=$(conntrack -L -p tcp -d "$l_ip" --dport 2049 2>/dev/null | grep -v "SYN_SENT" | grep "\[UNREPLIED\]")
    if [ -n "$output2049" ]; then
        mapfile -t entries <<< "$output2049"
        matchstr="tcp\s+[0-9]+\s+([0-9]+) .* dst=${l_ip_re} sport=([0-9]+) dport=([0-9]+) \[UNREPLIED\] src=([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+).*"
        for entry in "${entries[@]}"; do
            [ -n "$entry" ] || continue
            vecho "$entry"
            if [[ "$entry" =~ $matchstr ]]; then
                l_seconds_remaining=${BASH_REMATCH[1]}
                l_sport=${BASH_REMATCH[2]}
                l_dport=${BASH_REMATCH[3]}
                l_reply_srcip=${BASH_REMATCH[4]}
                reconcile_conntrack_unreplied "$l_ip" "$l_sport" "$l_dport" "$l_reply_srcip" "$l_seconds_remaining"
                found_stuck=0
            fi
        done
    fi

    return $found_stuck
}

#
# Failover a TLS stunnel connection to a new backend IP.
# Creates a new stunnel conf (with new log/pid paths) for the new IP, restarts
# stunnel on the same accept port, cleans up the old files and updates
# MOUNTMAPv4.
# Must be called with MOUNTMAPv4 NOT locked (this function acquires the lock).
#
# Arguments:
#   $1 - hostname of the share endpoint (written into the new mountmap entry).
#   $2 - IP the stunnel currently connects to.
#   $3 - current stunnel conf file.
#   $4 - current stunnel log file.
#   $5 - current stunnel pid file.
#   $6 - checksum of the current conf (unused, kept for the call signature).
#   $7 - current MOUNTMAPv4 line (unused, kept for the call signature).
#   $8 - new IP to connect to.
#
# Globals: reads MOUNTMAPv4 and LOCALHOST, updates mtime_mountmap.
#
# Returns 0 on success, 1 on failure. If the replacement stunnel cannot be
# started, the old conf/log/pid files are left in place so that the MOUNTMAPv4
# entry keeps pointing at files that exist.
#
failover_stunnel()
{
    local l_host=$1
    local l_ip=$2
    local l_conf=$3
    local l_log=$4
    local l_pid=$5
    local l_checksumhash=$6
    local old_line=$7
    local new_ip=$8

    local old_ip=$l_ip
    # Dots of the old IP must match literally inside the sed patterns below.
    local old_ip_re=${old_ip//./[.]}
    local accept_port stunnel_dir stunnel_log_dir
    local new_conf new_log new_pid
    local fd_fo pid stunnel_status new_checksum new_entry out ret
    local start_failed=false

    accept_port=$(grep accept "$l_conf" | cut -d ':' -f 2)

    # Derive directories and new file paths from the old paths.
    stunnel_dir=$(dirname "$l_conf")
    stunnel_log_dir=$(dirname "$l_log")
    new_conf="$stunnel_dir/stunnel_${new_ip}.conf"
    new_log="$stunnel_log_dir/stunnel_${new_ip}.log"
    new_pid="$stunnel_log_dir/stunnel_${new_ip}.pid"

    pecho "Stunnel failover for $l_host: $old_ip -> $new_ip (port $accept_port)"

    # Lock MOUNTMAPv4 for the entire failover operation so the watchdog's
    # checksum tamper detection doesn't race with our config update.
    exec {fd_fo}<"$MOUNTMAPv4"
    flock -e "$fd_fo"

    # Step 1: Create new stunnel conf with updated connect, pid, and output paths.
    chattr -f -i "$l_conf"
    if ! cp "$l_conf" "$new_conf"; then
        eecho "Failed to create new stunnel conf $new_conf!"
        chattr -f +i "$l_conf"
        flock -u "$fd_fo"
        exec {fd_fo}<&-
        return 1
    fi
    # The old conf stays protected until we know the new stunnel is running.
    chattr -f +i "$l_conf"

    # Update connect IP, pid path, log output path, and service section name in new conf.
    sed -i "s/connect = ${old_ip_re}:2049/connect = ${new_ip}:2049/" "$new_conf"
    sed -i "s#pid = ${l_pid}#pid = ${new_pid}#" "$new_conf"
    sed -i "s#output = ${l_log}#output = ${new_log}#" "$new_conf"
    sed -i "s/\[${old_ip_re}\]/[${new_ip}]/" "$new_conf"
    chattr -f +i "$new_conf"

    # Step 2: Kill the old stunnel process; it has to release the accept port
    # before the replacement can bind to it.
    if [ -f "$l_pid" ]; then
        pid=$(cat "$l_pid")
        pecho "Killing stunnel process $pid for failover"
        kill -9 "$pid" 2>/dev/null
    fi

    # Step 3: Start new stunnel (creates new pid and log files automatically).
    # Any output from stunnel is treated as a startup failure; a bind error is
    # retried once since the killed process may not have released the port yet.
    stunnel_status=$(stunnel "$new_conf" 2>&1)
    if [ -n "$stunnel_status" ]; then
        if grep -q "$LOCALHOST:$accept_port: Address already in use" <<< "$stunnel_status"; then
            sleep 1
            stunnel_status=$(stunnel "$new_conf" 2>&1)
            if [ -n "$stunnel_status" ]; then
                eecho "Failed to restart stunnel on port $accept_port after failover: $stunnel_status"
                start_failed=true
            fi
        else
            eecho "Failed to start stunnel after failover: $stunnel_status"
            start_failed=true
        fi
    fi

    if [ "$start_failed" == "true" ]; then
        #
        # Roll back: drop the unusable new conf (it would otherwise linger as
        # an immutable file) and keep the old conf/log/pid untouched, so the
        # MOUNTMAPv4 entry stays valid and stunnel can be restarted from the
        # old conf.
        #
        chattr -f -i "$new_conf"
        rm -f "$new_conf"
        flock -u "$fd_fo"
        exec {fd_fo}<&-
        return 1
    fi

    # The new stunnel is up, now the old stunnel files can go.
    rm -f "$l_log" "$l_pid"
    chattr -f -i "$l_conf"
    rm -f "$l_conf"

    # Step 4: Recompute checksum and update MOUNTMAPv4 with new IP, new file paths, and new checksum.
    new_checksum=$(cksum "$new_conf" | awk '{print $1}')

    # Build the new mountmap entry.
    new_entry="${l_host};${new_ip};${new_conf};${new_log};${new_pid};${new_checksum};mounted;0"

    chattr -f -i "$MOUNTMAPv4"
    #
    # Replace the entire old entry line (matched by old conf path) with the new
    # entry. The file is overwritten rather than edited in place so that the
    # inode we hold the lock on stays the same.
    #
    out=$(sed "\#${l_conf};#c\\${new_entry}" "$MOUNTMAPv4")
    ret=$?
    if [ $ret -eq 0 ]; then
        echo "$out" > "$MOUNTMAPv4"
        ret=$?
        out=
        if [ $ret -ne 0 ]; then
            eecho "*** [FATAL] MOUNTMAPv4 may be in inconsistent state, contact Microsoft support ***"
        fi
    fi
    chattr -f +i "$MOUNTMAPv4"

    # Update mtime so the main loop knows we changed it.
    mtime_mountmap=$(stat -c%Y "$MOUNTMAPv4")

    flock -u "$fd_fo"
    exec {fd_fo}<&-

    if [ $ret -ne 0 ]; then
        eecho "Failed to update MOUNTMAPv4 after stunnel failover!"
        return 1
    fi

    pecho "Stunnel failover complete for $l_host: $old_ip -> $new_ip"
    return 0
}

#
# Convert a dotted-decimal IPv4 address into 8 lowercase hex digits
# (e.g. 10.0.0.4 -> 0a000004) and print it without a trailing newline.
#
# Arguments:
#   $1 - IPv4 address in dotted-decimal form.
#
# Returns 1 (printing nothing) unless the input is exactly four octets of 1-3
# decimal digits, each in the range 0-255.
#
ip_to_hex()
{
    local ip=$1
    local IFS=.
    local o1 o2 o3 o4 octet
    read -r o1 o2 o3 o4 <<< "$ip"

    for octet in "$o1" "$o2" "$o3" "$o4"; do
        #
        # 1-3 digits only. This rejects missing/empty octets, non-numeric
        # text, the "4.5" remainder that read leaves in o4 for inputs with
        # more than four octets, and values too large for shell arithmetic.
        #
        [[ "$octet" =~ ^[0-9]{1,3}$ ]] || return 1
        # 10# forces decimal; "08" would otherwise be an invalid octal number.
        (( 10#$octet <= 255 )) || return 1
    done

    # Normalise to decimal before printf, which would also read a leading
    # zero as octal.
    printf "%02x%02x%02x%02x" "$((10#$o1))" "$((10#$o2))" "$((10#$o3))" "$((10#$o4))"
}

#
# Report whether NFSFS_SERVERS_FILE contains an NFSv4 ("v4") entry whose
# server and port columns match the given IP and port.
#
# Arguments:
#   $1 - IPv4 address to look for (converted with ip_to_hex).
#   $2 - TCP port to look for (decimal, 1-65535).
#
# Returns 0 when a matching entry exists. Returns 1 when there is none, when
# the arguments are invalid, or when the servers file cannot be read.
#
is_nfs_server_active_for_target()
{
    local target_ip=$1
    local target_port=$2
    local target_ip_hex
    local target_port_hex
    local entry
    local version_field server_field port_field

    # Validate the arguments before touching the servers file.
    if [[ -z "$target_ip" || -z "$target_port" ]]; then
        vecho "NFSv4 server entry check: missing target ip/port (ip='$target_ip', port='$target_port')."
        return 1
    fi
    if [[ ! "$target_port" =~ ^[0-9]+$ ]]; then
        vecho "NFSv4 server entry check: invalid target port (port='$target_port')."
        return 1
    fi
    if [ "$target_port" -lt 1 ] || [ "$target_port" -gt 65535 ]; then
        vecho "NFSv4 server entry check: target port out of range (port='$target_port')."
        return 1
    fi

    if [ ! -r "$NFSFS_SERVERS_FILE" ]; then
        vecho "NFSv4 server entry check: $NFSFS_SERVERS_FILE not readable; skipping guard."
        return 1
    fi

    # The file stores server and port in hex, so compare in that form.
    target_port_hex=$(printf "%x" "$target_port")
    target_ip_hex=$(ip_to_hex "$target_ip")

    if [ -z "$target_ip_hex" ]; then
        vecho "NFSv4 server entry check: failed to convert $target_ip to hex; skipping guard."
        return 1
    fi

    vecho "NFSv4 server entry check: looking for $target_ip:$target_port."

    while IFS= read -r entry; do
        # Ignore blank lines, lines starting with "NV" and comment lines.
        [ -n "$entry" ] || continue
        case "$entry" in
            NV*|\#*) continue ;;
        esac

        # Columns used: [0] version, [1] server (hex), [2] port (hex).
        IFS=' ' read -ra fields <<< "$entry"
        [ ${#fields[@]} -ge 4 ] || continue

        version_field=${fields[0]}
        server_field=${fields[1]}
        port_field=${fields[2]}

        # Only NFSv4 entries for the requested server are of interest.
        [ "$version_field" = "v4" ] || continue
        [ "${server_field,,}" = "$target_ip_hex" ] || continue

        if [ "${port_field,,}" = "$target_port_hex" ]; then
            vecho "NFSv4 server entry check: matched entry (server=$server_field, port=$port_field)."
            return 0
        fi
    done < "$NFSFS_SERVERS_FILE"

    return 1
}

#
# Kill the stunnel process and remove the stunnel files generated by the
# aznfs mount helper.
#
# Arguments:
#   $1 - stunnel conf file (the accept port is read from it for logging).
#   $2 - stunnel log file.
#   $3 - stunnel pid file (holds the pid of the process to kill).
#
# Failures are logged with eecho; the remaining steps are still attempted.
#
cleanup_stunnel_files()
{
    local l_conf=$1
    local l_log=$2
    local l_pid=$3
    local accept_port

    # The process goes first, while its pid file is still around.
    pid=$(cat $l_pid)
    accept_port=$(grep accept $l_conf | cut -d ':' -f 2)
    pecho "killing stunnel process with pid: $pid on port: $accept_port"
    kill -9 $pid || eecho "Unable to kill stunnel process $pid!"

    # Then the files it left behind.
    rm $l_log || eecho "[FATAL] Unable to delete stunnel log file $l_log!"

    rm $l_pid || eecho "[FATAL] Unable to delete stunnel pid file $l_pid!"

    # The conf file carries the immutable attribute; clear it before removal.
    chattr -if $l_conf
    rm $l_conf || eecho "[FATAL] Unable to delete stunnel conf file $l_conf!"
}

#
# Delete an entry from MOUNTMAPv4.
#
# Arguments:
#   $1 - the entry (line) to delete.
#   $2 - optional mtime; when given, the entry is deleted only if MOUNTMAPv4
#        still has exactly this modification time.
#
# On success prints the mtime of MOUNTMAPv4 after the update and returns 0.
# Returns 1 if the mtime check fails or the file could not be rewritten.
#
ensure_mountmapv4_not_exist()
{
    local expected_mtime="$2"
    local current_mtime

    #
    # The watchdog may ask for the deletion to happen only if MOUNTMAPv4 has
    # not changed since it looked the entry up; honour that.
    #
    if [ -n "$expected_mtime" ]; then
        current_mtime=$(stat -c%Y $MOUNTMAPv4)
        if [ "$current_mtime" != "$expected_mtime" ]; then
            eecho "[$1] Refusing to remove from ${MOUNTMAPv4} as $current_mtime != $expected_mtime!"
            return 1
        fi
    fi

    chattr -f -i $MOUNTMAPv4
    #
    # The file is overwritten with the filtered content rather than edited in
    # place with sed -i, because sed -i creates a new MOUNTMAPv4 file and that
    # breaks any locking that depends on the old file.
    #
    if out=$(sed "\%^${1}$%d" $MOUNTMAPv4); then
        # If this echo fails then MOUNTMAPv4 could be truncated.
        echo "$out" > $MOUNTMAPv4
        ret=$?
        out=
        [ $ret -eq 0 ] || eecho "*** [FATAL] MOUNTMAPv4 may be in inconsistent state, contact Microsoft support ***"
    else
        ret=$?
    fi
    chattr -f +i $MOUNTMAPv4

    if [ $ret -ne 0 ]; then
        eecho "[$1] failed to remove from ${MOUNTMAPv4}!"
        return 1
    fi

    pecho "[$1] removed from ${MOUNTMAPv4} successfully!"

    # Hand the mtime after our modification back to the caller.
    echo $(stat -c%Y $MOUNTMAPv4)
}

#
# Remove a MOUNTMAPv4 entry and, if that succeeded, tear down its stunnel.
#
# Arguments:
#   $1 - stunnel conf file.
#   $2 - stunnel log file.
#   $3 - stunnel pid file.
#   $4 - the MOUNTMAPv4 line to delete.
#
# Globals: reads MOUNTMAPv4, reads and updates mtime_mountmap.
#
# Returns 0 on success, 1 if the entry could not be deleted (in which case the
# stunnel process and files are left alone).
#
cleanup_mount()
{
    local l_conf=$1
    local l_log=$2
    local l_pid=$3
    local line=$4

    # The mount script may modify MOUNTMAPv4 as well, so hold the lock.
    exec {fd2}<$MOUNTMAPv4
    flock -e $fd2

    #
    # Delete IFF the mountmap has not changed since we read it. If it has, it
    # must be read again first, so nothing is modified here.
    #
    if ! l_mtime=$(ensure_mountmapv4_not_exist "$line" "$mtime_mountmap"); then
        eecho "Failed to delete entry from ${MOUNTMAPv4}! Entry: [$line]"
        flock -u $fd2
        exec {fd2}<&-
        return 1
    fi

    #
    # Record the mtime produced by our own update. That lets later checks tell
    # a change made by the watchdog (safe to keep updating) apart from a change
    # made by a mount helper.
    #
    mtime_mountmap=$l_mtime

    cleanup_stunnel_files $l_conf $l_log $l_pid

    flock -u $fd2
    exec {fd2}<&-
    return 0
}

#
# Process NFSv4 non-TLS DNAT-based mounts.
# Monitors MOUNTMAPv4NOTLS entries for:
#   1. Unmounted shares → remove the entry through ensure_mountmap_not_exist.
#   2. Missing DNAT rules → verify_iptable_entry is called for every live entry.
#   3. Backend failure → failover to another IP returned by resolve_ipv4.
#
# Globals:
#   MOUNTMAPv4NOTLS          (read)   mountmap file, one "host localip storageip" per line.
#   epoch_now                (read)   current time, set by the caller.
#   HEALTH_CHECK_FREQUENCY   (read)   seconds between active probes.
#   mtime_mountmap_notls     (write)  mtime of MOUNTMAPv4NOTLS as last seen/updated.
#   next_health_check_epoch  (write)  earliest time of the next active probe.
#
process_nfsv4_notls_mounts()
{
    local fd_notls notls_lines findmnt_notls
    local line l_host l_ip l_nfsip l_mtime
    local failure_detected new_ip mount_target
    local -a notls_entries

    if [ ! -f "$MOUNTMAPv4NOTLS" ]; then
        return
    fi

    #
    # Helpers called from here have always run with IFS set to newline because
    # this function used to assign IFS globally. Keep that for them, but scope
    # it so the caller's IFS is restored on return.
    #
    local IFS=$'\n'

    # Read mountmap under lock.
    exec {fd_notls}<"$MOUNTMAPv4NOTLS"
    flock -e "$fd_notls"
    mtime_mountmap_notls=$(stat -c%Y "$MOUNTMAPv4NOTLS")
    notls_lines=$(cat "$MOUNTMAPv4NOTLS")
    flock -u "$fd_notls"
    exec {fd_notls}<&-

    if [ -z "$notls_lines" ]; then
        return
    fi

    # Get current NFS mounts (done after reading mountmap for consistency).
    findmnt_notls=$(findmnt --raw --noheading -o SOURCE,TARGET -t nfs,nfs4 2>/dev/null)

    mapfile -t notls_entries <<< "$notls_lines"
    for line in "${notls_entries[@]}"; do
        if [ -z "$line" ]; then
            continue
        fi

        # MOUNTMAPv4NOTLS format: hostname localip storageip
        IFS=" " read -r l_host l_ip l_nfsip <<< "$line"

        if [[ -z "$l_host" || -z "$l_ip" || -z "$l_nfsip" ]]; then
            wecho "[FATAL] Deleting invalid line in $MOUNTMAPv4NOTLS: [$line]!"
            if l_mtime=$(ensure_mountmap_not_exist "$MOUNTMAPv4NOTLS" "$line" "$mtime_mountmap_notls"); then
                mtime_mountmap_notls=$l_mtime
            fi
            continue
        fi

        #
        # Check 1: Is the mount still active?
        # If no mount references this proxy IP, clean up.
        # The IP is matched as a fixed string on word boundaries (-wF), so
        # that 10.0.0.1 is not considered mounted just because 10.0.0.10 is.
        #
        if ! grep -qwF -- "$l_ip" <<< "$findmnt_notls"; then
            # Check kernel NFS server entries before cleanup (k8s namespace refs).
            if is_nfs_server_active_for_target "$l_ip" "2049"; then
                vecho "NFS server entry still active for $l_ip:2049; skipping cleanup for [$line]."
                continue
            fi

            pecho "No mounted shares for host $l_host with proxy IP $l_ip, cleaning up [$line]."
            if l_mtime=$(ensure_mountmap_not_exist "$MOUNTMAPv4NOTLS" "$line" "$mtime_mountmap_notls"); then
                mtime_mountmap_notls=$l_mtime
            fi
            continue
        fi

        #
        # Check 2: Verify DNAT rule still exists (safety net).
        #
        verify_iptable_entry "$l_ip" "$l_nfsip"

        #
        # Layer 1: Conntrack monitoring (passive, on every pass).
        # Detects stuck SYN_SENT or UNREPLIED entries on port 2049.
        #
        failure_detected=false
        if reconcile_conntrack_v4 "$l_ip" "$l_nfsip"; then
            failure_detected=true
            wecho "Conntrack stuck entries detected for $l_host [$l_ip -> $l_nfsip]"
        fi

        #
        # Layer 2: Active probe of the real storage IP on port 2049, at most
        # once every HEALTH_CHECK_FREQUENCY seconds.
        #
        if [ "$epoch_now" -ge "${next_health_check_epoch:-0}" ]; then
            if ! is_ip_port_reachable "$l_nfsip" 2049; then
                failure_detected=true
                wecho "Storage IP $l_nfsip is unreachable on port 2049 for $l_host"
            fi
        fi

        #
        # Layer 3: On failure, resolve DNS and failover to a healthy IP.
        #
        if [ "$failure_detected" == "true" ]; then
            pecho "Failure detected for $l_host [$l_ip -> $l_nfsip], attempting failover..."

            # On failure resolve_ipv4's output is used as the error text.
            if ! new_ip=$(resolve_ipv4 "$l_host" "false" 2049 "$l_nfsip"); then
                eecho "Failed to resolve $l_host during failover: $new_ip"
                continue
            fi

            if [ "$new_ip" == "$l_nfsip" ]; then
                # DNS still returns the same (dead) IP, or it's the only one reachable.
                wecho "No alternative IP found for $l_host (DNS returned same IP $l_nfsip)."
                continue
            fi

            pecho "IP for $l_host changed [$l_nfsip -> $new_ip], updating DNAT."

            if update_mountmap_entry "$MOUNTMAPv4NOTLS" "$line" "$l_host $l_ip $new_ip"; then
                pecho "Failover complete for $l_host [$l_nfsip -> $new_ip]"

                # Update mtime since we modified the file.
                mtime_mountmap_notls=$(stat -c%Y "$MOUNTMAPv4NOTLS")

                # Force NFS client to reconnect through new DNAT rule. The
                # listing has SOURCE,TARGET columns, so field 2 is the target.
                mount_target=$(grep -wF -- "$l_ip" <<< "$findmnt_notls" | awk '{print $2}' | head -1)
                if [ -n "$mount_target" ]; then
                    ping_new_endpoint "$mount_target" &
                fi
            else
                eecho "Failed to update DNAT for $l_host [$l_nfsip -> $new_ip]!"
            fi
        fi
    done

    # Update health check timer (shared across all entries).
    if [ "$epoch_now" -ge "${next_health_check_epoch:-0}" ]; then
        next_health_check_epoch=$(( epoch_now + HEALTH_CHECK_FREQUENCY ))
    fi
}

#
# Process NFSv4 TLS (stunnel based) mounts recorded in MOUNTMAPv4.
# For every entry this:
#   1. removes the entry (and its stunnel) when no mount uses its accept port,
#   2. removes the entry when its stunnel conf no longer matches the recorded
#      checksum,
#   3. restarts stunnel when it is not running,
#   4. for entries with a hostname and a non-private IP, probes the storage IP
#      and fails the stunnel over to another IP when it is unreachable.
#
# Globals:
#   MOUNTMAPv4, LOCALHOST, NETSTATCOMMAND, HEALTH_CHECK_FREQUENCY (read)
#   epoch_now, mtime_mountmap, next_tls_health_check_epoch        (write)
#
# Exits the watchdog if the mount listing fails unexpectedly or if
# NETSTATCOMMAND is empty.
#
process_nfsv4_mounts()
{
    local l_conf
    local l_log
    local l_pid
    local l_checksumhash
    local l_host l_ip l_status l_timeout l_mtime
    local fd fd2 lines line findmnt findmnt_status
    local semicolons field_count accept_port used_port
    local checksumHash stunnel_pid is_stunnel_running stunnel_status
    local new_ip tls_failure_detected tls_mount_target
    local -a mount_entries
    local -a localhost_filter=()

    epoch_now=$(date +%s)

    #
    # Helpers called from here have always run with IFS set to newline because
    # this function used to assign IFS globally. Keep that for them, but scope
    # it so the caller's IFS is restored on return.
    #
    local IFS=$'\n'

    #
    # Go over all lines in MOUNTMAPv4 and check them for two things:
    # 1. Is that entry still in use by at least one aznfs mount, if not remove the entry.
    # 2. Is stunnel process running?
    #
    # We store the mtime of MOUNTMAPv4 while inside the lock so that if any mount helper process
    # updates it after this we will skip modification for sake of safety. We will come to it
    # in the next iteration when it's safer.
    #
    exec {fd}<"$MOUNTMAPv4"
    flock -e "$fd"
    mtime_mountmap=$(stat -c%Y "$MOUNTMAPv4")
    lines=$(cat "$MOUNTMAPv4")
    flock -u "$fd"
    exec {fd}<&-

    #
    # findmnt must be done after reading MOUNTMAPv4 so that if we come across a
    # MOUNTMAPv4 entry whose all nfs file shares are unmounted, we know
    # for sure that it's not in use by any mount and can be removed.
    #
    # The listing is requested in raw form with explicit columns so that the
    # mount target is always the first field. LOCALHOST is passed to grep as
    # its own fixed-string pattern (it is never expanded inside single quotes)
    # and is left out when empty, since an empty pattern matches every line.
    #
    if [ -n "$LOCALHOST" ]; then
        localhost_filter=(-e "$LOCALHOST")
    fi
    findmnt=$(findmnt --raw --noheading -o TARGET,SOURCE,FSTYPE,OPTIONS | grep -F -e 'nfs4' "${localhost_filter[@]}" 2>&1)
    findmnt_status=$?

    #
    # With no matching mounts the pipeline also reports failure, so check
    # for both exit status and non-empty error o/p.
    #
    if [[ $findmnt_status -ne 0 && -n "$findmnt" ]]; then
        eecho "${findmnt}."
        eecho "[FATAL] findmnt failed unexpectedly!"
        eecho "[FATAL] aznfswatchdogv4 service is exiting, will not monitor Azure NFS file shares."
        eecho "[FATAL] Please contact Microsoft support before using any NFS File shares."
        # This usually indicates some non-transient issue, bail out.
        exit 1
    fi

    if [ -z "$NETSTATCOMMAND" ]; then
        eecho "[FATAL] No socket statistics command (netstat or ss) found! Aznfswatchdogv4 service is exiting. Please contact Microsoft support"
        exit 1
    fi

    mapfile -t mount_entries <<< "$lines"
    for line in "${mount_entries[@]}"; do
        if [ -z "$line" ]; then
            continue
        fi

        #
        # MOUNTMAPv4 line format:
        #   New (8 fields): <hostname>;<IP>;<stunnel.conf>;<stunnel.log>;<stunnel.pid>;<checksum>;<status>;<timeout>
        #   Old (7 fields): <IP>;<stunnel.conf>;<stunnel.log>;<stunnel.pid>;<checksum>;<status>;<timeout>
        # Detect by field count to maintain backward compatibility with pre-existing mounts.
        #
        semicolons=${line//[^;]/}
        field_count=$(( ${#semicolons} + 1 ))
        if [ "$field_count" -ge 8 ]; then
            IFS=";" read -r l_host l_ip l_conf l_log l_pid l_checksumhash l_status l_timeout <<< "$line"
        else
            # Old format (7 fields): no hostname, failover not possible.
            l_host=""
            IFS=";" read -r l_ip l_conf l_log l_pid l_checksumhash l_status l_timeout <<< "$line"
        fi

        if [[ -z "$l_ip" || -z "$l_conf" || -z "$l_pid" ]]; then
            wecho "[FATAL] Deleting invalid line in $MOUNTMAPv4: [$line]!"
            exec {fd2}<"$MOUNTMAPv4"
            flock -e "$fd2"
            if l_mtime=$(ensure_mountmapv4_not_exist "$line"); then
                mtime_mountmap=$l_mtime
            fi
            flock -u "$fd2"
            exec {fd2}<&-
            continue
        fi

        # Skip if the status is waiting, which means the mountscript is still processing the mount.
        if [ "$l_status" == "waiting" ]; then
            if [[ $l_timeout -ge $(date +%s) ]]; then
                # Timeout is in future, skip this entry.
                # If a mount entry stays in the 'waiting' state for a long time (greater than the mount timeout), it's safe to clean it up.
                continue
            fi
        fi

        accept_port=$(grep accept "$l_conf" | cut -d ':' -f 2)

        #
        # Delete entry from MOUNTMAPv4 if there are no mounted shares on that host.
        # As long as we have at least one mount using the MOUNTMAPv4 entry, we leave
        # it around.
        #
        if ! grep -q -- "$accept_port" <<< "$findmnt"; then
            vecho "findmnt shows no mount for accept_port=$accept_port (line=[$line])"

            # Some k8s pods may still hold mount namespace refs even after the
            # host mount is gone. Skip cleanup if kernel NFS server refs are
            # still active. For waiting/failed entries, cleanup to kill stunnel.
            if [ "$l_status" == "mounted" ]; then
                if is_nfs_server_active_for_target "$LOCALHOST" "$accept_port"; then
                    pecho "NFS server entry still active for $LOCALHOST:$accept_port; skipping cleanup for [$line]."
                    continue
                fi
            fi

            pecho "No mounted shares for host $l_ip with accept port $accept_port, deleting from ${MOUNTMAPv4} [$line]."

            cleanup_mount "$l_conf" "$l_log" "$l_pid" "$line"
            continue
        fi

        #
        # From here on at least one mount uses this entry.
        #
        # Check if checksumHash for stunnel.conf file has changed.
        # Customers should not modify stunnel.conf files created by aznfs mount helper.
        # The pipeline's exit status is awk's, so a cksum failure is detected by
        # its empty output. The comparison is skipped when either side is empty,
        # as an unreadable conf is not evidence of tampering.
        #
        checksumHash=$(cksum "$l_conf" | awk '{print $1}')
        if [ -z "$checksumHash" ]; then
            eecho "Failed to get the checksum hash of file: '${l_conf}'!"
        elif [ -n "$l_checksumhash" ] && [ "$checksumHash" != "$l_checksumhash" ]; then
            eecho "'${l_conf}' file has modified!"
            eecho "It's not recommended to modify '${l_conf}' file created by aznfs mount helper!"
            eecho "watchdog service will do cleanup, kill stunnel process with pid:$(cat "$l_pid") and remove '${l_conf}'; '${l_log}'; '${l_pid}'!"
            eecho "Please remount the shares from ${l_ip} using aznfs mount helper."

            cleanup_mount "$l_conf" "$l_log" "$l_pid" "$line"
            continue
        fi

        #
        # Is stunnel running? The pid is matched as a whole word so that e.g.
        # pid 123 does not match a stunnel with pid 1234. A missing or empty
        # pid file counts as "not running".
        #
        stunnel_pid=$(cat "$l_pid")
        is_stunnel_running=""
        if [ -n "$stunnel_pid" ]; then
            is_stunnel_running=$($NETSTATCOMMAND -anp | grep stunnel | grep -w -- "$stunnel_pid")
        fi
        if [ -z "$is_stunnel_running" ]; then
            vecho "Watchdog: stunnel is not running! Restarting the stunnel"

            # Any output from stunnel is treated as a startup failure.
            stunnel_status=$(stunnel "$l_conf" 2>&1)
            if [ -n "$stunnel_status" ]; then
                used_port=$(grep accept "$l_conf" | cut -d: -f2)
                if grep -q "$LOCALHOST:$used_port: Address already in use" <<< "$stunnel_status"; then
                    eecho "[FATAL] Restarting stunnel failed.$used_port port is already being used by other process.!"
                    eecho "It's recommended to unmount all shares from $l_ip and then remount shares using aznfs mount helper!"
                elif grep -q "certificate verify failed" <<< "$stunnel_status"; then
                    eecho "[FATAL] Restarting stunnel failed. CA root certificate is either missing or is unable to authenticate TLS server certificate."
                    eecho "Please download the CA root certificate from https://learn.microsoft.com/en-us/azure/security/fundamentals/azure-ca-details"
                    eecho "It's recommended to unmount all shares from $l_ip and then remount shares using aznfs mount helper!"
                else
                    eecho "[FATAL] watchdog service is unable to start stunnel process for '${l_conf}'!"
                    eecho "Please check the stunnel logs in $l_log for more details."
                    eecho "It's recommended to unmount all shares from $l_ip and then remount shares using aznfs mount helper!"
                fi
            fi
        fi

        #
        # ZRS failover detection for public endpoint TLS mounts.
        # Skip for private endpoints — Azure handles failover internally.
        #
        if ! is_private_ip "$l_ip" && [ -n "$l_host" ]; then
            tls_failure_detected=false

            # TCP probe the real storage IP (every HEALTH_CHECK_FREQUENCY seconds).
            if [ "$epoch_now" -ge "${next_tls_health_check_epoch:-0}" ]; then
                if ! is_ip_port_reachable "$l_ip" 2049; then
                    tls_failure_detected=true
                    wecho "Storage IP $l_ip is unreachable on port 2049 for TLS mount $l_host"
                fi
            fi

            if [ "$tls_failure_detected" == "true" ]; then
                pecho "TLS failure detected for $l_host [$l_ip], attempting stunnel failover..."

                # On failure resolve_ipv4's output is used as the error text.
                if ! new_ip=$(resolve_ipv4 "$l_host" "false" 2049 "$l_ip"); then
                    eecho "Failed to resolve $l_host during TLS failover: $new_ip"
                elif [ "$new_ip" == "$l_ip" ]; then
                    wecho "No alternative IP found for $l_host (DNS returned same IP $l_ip)."
                elif failover_stunnel "$l_host" "$l_ip" "$l_conf" "$l_log" "$l_pid" "$l_checksumhash" "$line" "$new_ip"; then
                    # Force NFS client to reconnect through restarted stunnel.
                    # TARGET is the first column of the listing taken above.
                    tls_mount_target=$(grep -- "$accept_port" <<< "$findmnt" | awk '{print $1}' | head -1)
                    if [ -n "$tls_mount_target" ]; then
                        ping_new_endpoint "$tls_mount_target" &
                    fi
                fi
            fi
        fi
    done

    # Update TLS health check timer.
    if [ "$epoch_now" -ge "${next_tls_health_check_epoch:-0}" ]; then
        next_tls_health_check_epoch=$(( epoch_now + HEALTH_CHECK_FREQUENCY ))
    fi
}

#
# Service entry point.
#
# Load common aznfs helpers. AZNFS_VERSION is set before sourcing, presumably
# because common.sh reads it (TODO confirm). The helpers and variables used in
# this file but not defined in it (vecho/pecho/wecho/eecho, log_version_info,
# MOUNTMAPv4, MOUNTMAPv4NOTLS, MONITOR_INTERVAL_SECS, ...) are expected to come
# from common.sh, so it must be sourced before anything below runs.
#
AZNFS_VERSION=4
. /opt/microsoft/aznfs/common.sh

vecho "Starting aznfswatchdog for NFSv4..."

# Detect and log distro, bash and AZNFS-mount version
log_version_info

# Set the immutable attribute on both mountmap files. A failure is only
# reported as a warning; the watchdog keeps running without it.
if ! chattr -f +i $MOUNTMAPv4; then
    wecho "chattr does not work for ${MOUNTMAPv4}!"
fi

if ! chattr -f +i $MOUNTMAPv4NOTLS; then
    wecho "chattr does not work for ${MOUNTMAPv4NOTLS}!"
fi

# Main loop: runs forever, one pass over the TLS mountmap and one over the
# non-TLS mountmap every MONITOR_INTERVAL_SECS seconds.
while :; do
    sleep $MONITOR_INTERVAL_SECS
    # Timestamp of this pass, used by the health-check timers.
    # process_nfsv4_mounts refreshes it again on entry.
    epoch_now=$(date +%s)
    process_nfsv4_mounts
    process_nfsv4_notls_mounts
done
