Skip to content

Watchdog - Multi Workflow Health Monitor #11691

Watchdog - Multi Workflow Health Monitor

Watchdog - Multi Workflow Health Monitor #11691

Workflow file for this run

# Watchdog workflow: every 10 minutes, look for runs of blank.yml that have
# been sitting in the "queued" state for too long, open one GitHub issue per
# stale run and severity level, and close the alert issues again once no
# queued run is stale any more.
name: Watchdog - Monitor gha-prod-workflow Status

on:
  schedule:
    - cron: "*/10 * * * *" # every 10 minutes
  workflow_dispatch:

# Scopes the job needs from GITHUB_TOKEN: read workflow runs, write issues.
# Declared explicitly so the job does not depend on the repository default.
permissions:
  actions: read
  issues: write

jobs:
  monitor-blank:
    runs-on: ubuntu-latest
    steps:
      # Step 1: list queued runs and classify them by age.
      # Writes "id,run_number,created_at,severity" lines to stale_runs.txt and
      # exports stale_found=true|false through GITHUB_ENV for the later steps.
      - name: Check queued runs of blank.yml
        id: check
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          echo "Checking queued runs for blank.yml..."

          # Filter on the server (status=queued) and follow pagination, so
          # runs that are no longer on the first page of results are still
          # seen. Output is one tab-separated line per run.
          runs=$(gh api --paginate \
            "repos/$GITHUB_REPOSITORY/actions/workflows/blank.yml/runs?status=queued&per_page=100" \
            --jq '.workflow_runs[] | select(.status=="queued") | [.id, .run_number, .created_at] | @tsv')
          echo "Found runs:"
          echo "$runs"

          # Default: no stale jobs
          echo "stale_found=false" >> "$GITHUB_ENV"

          if [ -z "$runs" ]; then
            echo "No queued runs found."
            exit 0
          fi

          # Age thresholds in seconds: yellow > 30 min, red > 60 min,
          # severe > 90 min. Only the highest matching level is recorded.
          now=$(date -u +%s)
          : > stale_runs.txt
          while IFS=$'\t' read -r id num created; do
            if [ -z "$id" ]; then
              continue
            fi
            created_ts=$(date -u -d "$created" +%s)
            age=$(( now - created_ts ))
            if [ "$age" -gt 5400 ]; then
              severity="severe"
            elif [ "$age" -gt 3600 ]; then
              severity="red"
            elif [ "$age" -gt 1800 ]; then
              severity="yellow"
            else
              continue
            fi
            echo "$id,$num,$created,$severity" >> stale_runs.txt
          done <<< "$runs"

          if [ -s stale_runs.txt ]; then
            echo "stale_found=true" >> "$GITHUB_ENV"
          fi

      # Step 2: open one issue per stale run and severity, unless an open
      # issue with exactly the same title already exists.
      # The labels passed to `gh issue create` are not created here.
      - name: Create Issue if stale jobs
        if: env.stale_found == 'true'
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Fail the step if gh fails on the left side of the gh | jq pipe.
          set -o pipefail

          # Read the list on fd 3 so commands inside the loop cannot consume
          # the remaining lines from stdin.
          while IFS=, read -r -u 3 id num created severity; do
            if [ -z "$id" ]; then
              continue
            fi
            run_url="https://github.com/$GITHUB_REPOSITORY/actions/runs/$id"

            case "$severity" in
              severe)
                title="🚨🔥 Severe Red Alert: Queued job stuck > 90 minutes (blank.yml) - Run ID: $id"
                minutes=90
                label="watchdog,severe-red-alert"
                ;;
              red)
                title="🚨🛑 Red Alert: Queued job stuck > 60 minutes (blank.yml) - Run ID: $id"
                minutes=60
                label="watchdog,red-alert"
                ;;
              *)
                title="⚠️🟡 Yellow Alert: Queued job stuck > 30 minutes (blank.yml) - Run ID: $id"
                minutes=30
                label="watchdog,yellow-alert"
                ;;
            esac

            # printf expands \n\n into a real paragraph break; inside a plain
            # double-quoted string it would reach the issue as literal text.
            body=$(printf 'Workflow **blank.yml** (Run Number: %s) has been queued since %s and is still stuck (>%s minutes).\n\n🔗 [View Run in GitHub Actions](%s)' \
              "$num" "$created" "$minutes" "$run_url")

            # Check if issue already exists for this Run ID + severity:
            # search by run ID, then compare the full title exactly.
            existing_issue=$(gh issue list \
              --repo "$GITHUB_REPOSITORY" \
              --state open \
              --search "$id in:title" \
              --json number,title \
              | jq -r --arg title "$title" \
                'map(select(.title == $title)) | .[0].number // empty')

            if [ -z "$existing_issue" ]; then
              echo "Creating new issue for Run ID: $id ($severity)"
              gh issue create \
                --repo "$GITHUB_REPOSITORY" \
                --title "$title" \
                --body "$body" \
                --label "$label"
            else
              echo "Issue already exists for Run ID: $id ($severity) -> #$existing_issue"
            fi
          done 3< stale_runs.txt

      # Step 3: when no queued run is stale, close every open watchdog issue
      # for blank.yml.
      - name: Close Issues if queue is healthy
        if: env.stale_found == 'false'
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Find all open watchdog issues for blank.yml
          issues=$(gh issue list \
            --repo "$GITHUB_REPOSITORY" \
            --search "Queued job stuck (blank.yml) in:title state:open" \
            --json number --jq '.[].number')

          if [ -z "$issues" ]; then
            echo "No open watchdog issues found."
            exit 0
          fi

          # $issues is a whitespace-separated list of issue numbers.
          for issue_number in $issues; do
            echo "Closing Issue #$issue_number"
            gh issue close "$issue_number" --repo "$GITHUB_REPOSITORY" \
              --comment "✅ Corresponding workflow run is no longer queued. Closing watchdog alert."
          done