Implement executing tasks
This commit is contained in:
82
scripts/quadlet/watchdog.sh
Executable file
82
scripts/quadlet/watchdog.sh
Executable file
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env bash
#
# Container watchdog: configuration section.
#
# Settings are read from the process environment and, when present, from the
# env file named by STACK_ENV (default: <repo root>/stack.env):
#   GIA_STACK_ID / STACK_ID     optional stack id (GIA_STACK_ID wins)
#   WATCHDOG_SLEEP_SECONDS      pause between polls (default 15)
#   NTFY_TOPIC / NOTIFY_TOPIC   ntfy topic (NTFY_TOPIC wins; empty disables)
#   NTFY_URL_BASE               ntfy server base URL (default https://ntfy.sh)
set -euo pipefail

# Repository root, two directories above the directory holding this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

#######################################
# Reduce a stack id to characters that are safe inside a container name.
# Every run of characters outside [a-zA-Z0-9._-] becomes a single '-',
# repeated '-' are squeezed, and leading/trailing '-' are stripped.
# Arguments: $1 - raw stack id (may be empty or omitted)
# Outputs:   sanitized id on stdout (possibly empty)
#######################################
sanitize_stack_id() {
  # printf instead of echo: no trailing newline for tr to translate, and a
  # value such as "-n" is not swallowed as an option. LC_ALL=C keeps the
  # a-z/A-Z ranges byte-wise regardless of the caller's locale.
  printf '%s' "${1:-}" \
    | LC_ALL=C tr -cs 'a-zA-Z0-9._-' '-' \
    | sed 's/^-*//; s/-*$//'
}

#######################################
# Load the env file (if any) and derive the watchdog settings from it.
# The env file is sourced BEFORE the settings are derived so that every
# variable it defines (GIA_STACK_ID, WATCHDOG_SLEEP_SECONDS, NOTIFY_TOPIC, ...)
# is honoured, and so that the stack id is always sanitized.
# Because the file is sourced, a variable it assigns replaces a same-named
# variable inherited from the process environment.
# Globals:   ROOT_DIR (read); STACK_ENV, STACK_ID, SLEEP_SECONDS, NTFY_TOPIC,
#            NTFY_URL_BASE, HOST_TAG (written)
# Outputs:   a warning on stderr when the sleep interval is not usable
#######################################
load_stack_config() {
  STACK_ENV="${STACK_ENV:-$ROOT_DIR/stack.env}"

  if [[ -f "$STACK_ENV" ]]; then
    # set -a exports everything the env file assigns.
    set -a
    # shellcheck source=/dev/null
    . "$STACK_ENV"
    set +a
  fi

  STACK_ID="$(sanitize_stack_id "${GIA_STACK_ID:-${STACK_ID:-}}")"

  SLEEP_SECONDS="${WATCHDOG_SLEEP_SECONDS:-15}"
  # An unusable interval would make `sleep` fail and, under `set -e`, kill the
  # watchdog after its first pass; fall back to the default instead.
  if [[ ! "$SLEEP_SECONDS" =~ ^[0-9]+([.][0-9]+)?[smhd]?$ ]]; then
    printf 'watchdog: invalid WATCHDOG_SLEEP_SECONDS=%s, using 15\n' \
      "$SLEEP_SECONDS" >&2
    SLEEP_SECONDS=15
  fi

  NTFY_TOPIC="${NTFY_TOPIC:-${NOTIFY_TOPIC:-}}"
  NTFY_URL_BASE="${NTFY_URL_BASE:-https://ntfy.sh}"
  HOST_TAG="${HOSTNAME:-$(hostname 2>/dev/null || echo unknown-host)}"
}

load_stack_config

#######################################
# Build a container name, suffixed with the stack id when one is configured.
# Globals:   STACK_ID (read; unset or empty means "no suffix")
# Arguments: $1 - base container name
# Outputs:   "<base>_<STACK_ID>" when STACK_ID is non-empty, otherwise "<base>"
#######################################
name_with_stack() {
  local base="$1"
  # printf rather than echo so a name that looks like an echo option
  # (e.g. "-n") is printed verbatim.
  if [[ -n "${STACK_ID:-}" ]]; then
    printf '%s\n' "${base}_${STACK_ID}"
  else
    printf '%s\n' "$base"
  fi
}

#######################################
# Best-effort push notification through an ntfy server.
# Does nothing (and succeeds) when no topic is configured or curl is not
# installed. A failed delivery is never fatal to the caller.
# Globals:   NTFY_TOPIC (read; empty/unset disables notifications)
#            NTFY_URL_BASE (read; defaults to https://ntfy.sh)
# Arguments: $1 - notification title
#            $2 - notification body
# Returns:   always 0
#######################################
notify() {
  local title="$1"
  local msg="$2"
  local topic="${NTFY_TOPIC:-}"
  local url_base="${NTFY_URL_BASE:-https://ntfy.sh}"

  [[ -n "$topic" ]] || return 0
  command -v curl >/dev/null 2>&1 || return 0

  # Timeouts keep an unreachable or stalled ntfy server from blocking the
  # watchdog loop. --data-raw sends the body verbatim, so a message starting
  # with '@' is not interpreted as a file to upload.
  # Delivery errors are deliberately ignored: alerting is best-effort.
  curl -sS --connect-timeout 5 --max-time 15 \
    -X POST "${url_base%/}/$topic" \
    -H "Title: $title" \
    -H "Tags: warning" \
    --data-raw "$msg" >/dev/null || true
}

# Containers supervised by the watchdog; each base name receives the stack
# suffix produced by name_with_stack.
CONTAINERS=(
  "$(name_with_stack "gia")"
  "$(name_with_stack "asgi_gia")"
  "$(name_with_stack "ur_gia")"
  "$(name_with_stack "scheduling_gia")"
  "$(name_with_stack "codex_worker_gia")"
)

# Last state recorded per container: unknown | up | recovering | down.
declare -A LAST_STATE
for name in "${CONTAINERS[@]}"; do
  LAST_STATE["$name"]="unknown"
done

# Poll forever: restart any container that is not running and report state
# transitions through notify.
while true; do
  for name in "${CONTAINERS[@]}"; do
    running="false"
    # A failing inspect (e.g. the container does not exist) is treated the
    # same as "not running".
    if inspect_out="$(podman inspect -f '{{.State.Running}}' "$name" 2>/dev/null)"; then
      running="${inspect_out//$'\n'/}"
      running="${running,,}"
    fi

    previous="${LAST_STATE[$name]}"

    if [[ "$running" == "true" ]]; then
      # Announce a recovery only after this watchdog actually saw the
      # container down; a healthy container on the first poll ("unknown")
      # has not recovered from anything.
      if [[ "$previous" == "recovering" || "$previous" == "down" ]]; then
        notify "GIA recovered: $name" "[$HOST_TAG] container $name is now running"
      fi
      LAST_STATE["$name"]="up"
      continue
    fi

    restart_out=""
    if restart_out="$(podman restart "$name" 2>&1)"; then
      LAST_STATE["$name"]="recovering"
      notify "GIA restarted: $name" "[$HOST_TAG] container $name was not running and restart succeeded"
    else
      LAST_STATE["$name"]="down"
      # Alert on the transition into "down" only; the restart is still
      # retried on every poll, but the same failure is not re-sent each time.
      if [[ "$previous" != "down" ]]; then
        notify "GIA restart failed: $name" "[$HOST_TAG] restart failed for $name: $restart_out"
      fi
    fi
  done

  sleep "$SLEEP_SECONDS"
done
Reference in New Issue
Block a user