#!/usr/bin/env bash
#
# Container watchdog: polls a fixed set of podman containers, restarts any
# that are not running, and optionally reports state changes through ntfy.
#
# Settings (read from the environment or from the stack env file):
#   STACK_ENV                  env file to source (default: $ROOT_DIR/stack.env)
#   GIA_STACK_ID / STACK_ID    optional suffix appended to container names
#   WATCHDOG_SLEEP_SECONDS     delay between polling rounds (default: 15)
#   NTFY_TOPIC / NOTIFY_TOPIC  ntfy topic; notifications are skipped when empty
#   NTFY_URL_BASE              ntfy server base URL (default: https://ntfy.sh)

set -euo pipefail

# Two directories above the directory that holds this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
STACK_ENV="${STACK_ENV:-$ROOT_DIR/stack.env}"

# Source the env file BEFORE deriving any setting from it. Sourcing it
# afterwards meant WATCHDOG_SLEEP_SECONDS / NOTIFY_TOPIC / GIA_STACK_ID from
# the file were ignored, and a STACK_ID from the file replaced the sanitized
# value with the raw one.
if [[ -f "$STACK_ENV" ]]; then
  set -a # export everything the env file defines
  # shellcheck disable=SC1090
  . "$STACK_ENV"
  set +a
fi

# GIA_STACK_ID takes precedence over STACK_ID; both may be absent.
STACK_ID="${GIA_STACK_ID:-${STACK_ID:-}}"
# Collapse every run of characters outside [a-zA-Z0-9._-] into a single '-',
# then strip leading/trailing dashes.
STACK_ID="$(printf '%s' "$STACK_ID" | tr -cs 'a-zA-Z0-9._-' '-' | sed 's/^-*//; s/-*$//')"

SLEEP_SECONDS="${WATCHDOG_SLEEP_SECONDS:-15}"
NTFY_TOPIC="${NTFY_TOPIC:-${NOTIFY_TOPIC:-}}"
NTFY_URL_BASE="${NTFY_URL_BASE:-https://ntfy.sh}"
# Host label used in notification bodies.
HOST_TAG="${HOSTNAME:-$(hostname 2>/dev/null || echo unknown-host)}"

#######################################
# Build a container name, suffixed with the stack id when one is configured.
# Globals:   STACK_ID (read; may be empty or unset)
# Arguments: $1 - base container name
# Outputs:   "<base>_<STACK_ID>" when STACK_ID is non-empty, otherwise "<base>"
#######################################
name_with_stack() {
  local base="$1"
  # printf instead of echo: echo would treat a value such as "-n" as an option.
  if [[ -n "${STACK_ID:-}" ]]; then
    printf '%s_%s\n' "$base" "$STACK_ID"
  else
    printf '%s\n' "$base"
  fi
}

#######################################
# Send a best-effort notification to an ntfy topic.
# Globals:   NTFY_TOPIC (read)     - topic name; empty disables notifications
#            NTFY_URL_BASE (read)  - server base URL (trailing '/' is stripped)
#            NTFY_MAX_TIME (read)  - optional total curl timeout, default 10s
# Arguments: $1 - notification title
#            $2 - notification body
# Returns:   always 0; delivery failures never abort the caller
#######################################
notify() {
  local title="$1"
  local msg="$2"

  # Nothing to do without a topic or without curl.
  if [[ -z "${NTFY_TOPIC:-}" ]]; then
    return 0
  fi
  if ! command -v curl >/dev/null 2>&1; then
    return 0
  fi

  # Bounded timeouts: without them an unresponsive server would stall the
  # whole watchdog loop. Failure is deliberately ignored (best effort).
  curl -sS -X POST \
    --connect-timeout 5 \
    --max-time "${NTFY_MAX_TIME:-10}" \
    -H "Title: $title" \
    -H "Tags: warning" \
    -d "$msg" \
    "${NTFY_URL_BASE%/}/$NTFY_TOPIC" >/dev/null || true
}

# Containers supervised by the watchdog. Each base name is passed through
# name_with_stack, and every resulting name starts in the "unknown" state.
# LAST_STATE values used by the script: unknown | up | recovering | down.
CONTAINERS=()
declare -A LAST_STATE=()
for base in gia asgi_gia ur_gia scheduling_gia codex_worker_gia; do
  # Plain assignment (not an array literal) so a failing helper is not masked
  # under `set -e`.
  name="$(name_with_stack "$base")"
  CONTAINERS+=("$name")
  LAST_STATE["$name"]="unknown"
done
unset base

# Without podman every inspect/restart would fail and produce an endless
# stream of "restart failed" notifications, so stop early with a clear error.
if ! command -v podman >/dev/null 2>&1; then
  printf 'watchdog: podman not found in PATH\n' >&2
  exit 1
fi

#######################################
# Check one container; restart it when it is not running and send
# notifications on state transitions only.
# Globals:   LAST_STATE (read/written), HOST_TAG (read)
# Arguments: $1 - container name
# Returns:   always 0
#######################################
check_container() {
  local name="$1"
  local previous="${LAST_STATE[$name]:-unknown}"
  local inspect_out=""
  local restart_out=""
  local running="false"

  # An inspect failure (e.g. the container does not exist) counts as
  # "not running".
  if inspect_out="$(podman inspect -f '{{.State.Running}}' "$name" 2>/dev/null)"; then
    running="${inspect_out//$'\n'/}"
    running="${running,,}"
  fi

  if [[ "$running" == "true" ]]; then
    # Only report a recovery after an outage was actually observed; the first
    # healthy check after watchdog start (unknown -> up) is not a recovery.
    if [[ "$previous" == "recovering" || "$previous" == "down" ]]; then
      notify "GIA recovered: $name" "[$HOST_TAG] container $name is now running"
    fi
    LAST_STATE["$name"]="up"
    return 0
  fi

  if restart_out="$(podman restart "$name" 2>&1)"; then
    LAST_STATE["$name"]="recovering"
    notify "GIA restarted: $name" "[$HOST_TAG] container $name was not running and restart succeeded"
  else
    LAST_STATE["$name"]="down"
    # Alert once when entering the "down" state instead of on every poll.
    if [[ "$previous" != "down" ]]; then
      notify "GIA restart failed: $name" "[$HOST_TAG] restart failed for $name: $restart_out"
    fi
  fi
  return 0
}

# Poll every supervised container, then sleep, forever.
while true; do
  for name in "${CONTAINERS[@]}"; do
    check_container "$name"
  done
  sleep "$SLEEP_SECONDS"
done