Watchdog - Multi Workflow Health Monitor #11652
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Watchdog workflow: every 10 minutes, look for runs of blank.yml that have
# been sitting in the "queued" state for too long and raise (or close)
# GitHub issues accordingly.
#
# Severity tiers, by time spent queued:
#   yellow  > 30 minutes
#   red     > 60 minutes
#   severe  > 90 minutes
name: Watchdog - Monitor gha-prod-workflow Status

on:
  schedule:
    - cron: "*/10 * * * *" # every 10 minutes
  workflow_dispatch:

# Scopes the steps below need from GITHUB_TOKEN: read workflow runs,
# and list/create/close issues.
permissions:
  actions: read
  issues: write

jobs:
  monitor-blank:
    runs-on: ubuntu-latest
    steps:
      # Step 1: collect queued runs, classify them by age, and publish the
      # result through $GITHUB_ENV (stale_found) and stale_runs.txt.
      - name: Check queued runs of blank.yml
        id: check
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          echo "Checking queued runs for blank.yml..."

          # Ask the API for queued runs only and follow pagination, so that
          # old stuck runs are not hidden behind newer runs on later pages.
          # Each run is emitted as one tab-separated line:
          #   <id> <run_number> <created_at>
          runs=$(gh api --paginate \
            "repos/${GITHUB_REPOSITORY}/actions/workflows/blank.yml/runs?status=queued&per_page=100" \
            --jq '.workflow_runs[] | select(.status=="queued") | [.id, .run_number, .created_at] | @tsv')

          echo "Found runs:"
          printf '%s\n' "$runs"

          # Default: no stale jobs. Later steps branch on this variable.
          echo "stale_found=false" >> "$GITHUB_ENV"

          if [ -z "$runs" ]; then
            echo "No queued runs found."
            exit 0
          fi

          now=$(date -u +%s)
          stale_runs=""

          while IFS=$'\t' read -r id num created; do
            [ -n "$id" ] || continue

            created_ts=$(date -u -d "$created" +%s)
            age=$(( now - created_ts ))   # seconds spent in the queue

            # Highest matching tier wins; runs younger than 30 minutes
            # are not reported.
            if (( age > 90 * 60 )); then
              severity="severe"
            elif (( age > 60 * 60 )); then
              severity="red"
            elif (( age > 30 * 60 )); then
              severity="yellow"
            else
              continue
            fi

            # CSV record consumed by the "Create Issue" step.
            stale_runs+="$id,$num,$created,$severity"$'\n'
          done <<< "$runs"

          if [ -n "$stale_runs" ]; then
            printf '%s' "$stale_runs" > stale_runs.txt
            echo "stale_found=true" >> "$GITHUB_ENV"
          fi

      # Step 2: open one issue per (run, severity) pair unless an open issue
      # for that pair already exists.
      - name: Create Issue if stale jobs
        if: env.stale_found == 'true'
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          while IFS=, read -r id num created severity; do
            [ -n "$id" ] || continue

            run_url="https://github.com/${GITHUB_REPOSITORY}/actions/runs/$id"

            case "$severity" in
              severe)
                icon="🚨🔥"
                heading="Severe Red Alert"
                minutes=90
                severity_label="severe-red-alert"
                ;;
              red)
                icon="🚨🟠"
                heading="Red Alert"
                minutes=60
                severity_label="red-alert"
                ;;
              *)
                icon="⚠️🟡"
                heading="Yellow Alert"
                minutes=30
                severity_label="yellow-alert"
                ;;
            esac

            title="$icon $heading: Queued job stuck > $minutes minutes (blank.yml) - Run ID: $id"

            # $'\n\n' yields real newlines; a "\n" inside double quotes would
            # reach the issue body as a literal backslash followed by "n".
            body="Workflow **blank.yml** (Run Number: $num) has been queued since $created and is still stuck (>$minutes minutes)."
            body+=$'\n\n'"🔗 [View Run in GitHub Actions]($run_url)"

            # Check if an open issue already exists for this Run ID + severity.
            # Matching on the quoted "Run ID: <id>" phrase plus the severity
            # label keeps emoji and search operators (>, parentheses, -) out
            # of the search query.
            existing_issue=$(gh issue list \
              --repo "$GITHUB_REPOSITORY" \
              --state open \
              --label "$severity_label" \
              --search "\"Run ID: $id\" in:title" \
              --json number --jq '.[0].number // empty')

            if [ -z "$existing_issue" ]; then
              echo "Creating new issue for Run ID: $id ($severity)"
              gh issue create \
                --repo "$GITHUB_REPOSITORY" \
                --title "$title" \
                --body "$body" \
                --label "watchdog,$severity_label"
            else
              echo "Issue already exists for Run ID: $id ($severity) -> #$existing_issue"
            fi
          done < stale_runs.txt

      # Step 3: when the check step found nothing stale, close every open
      # watchdog issue for blank.yml.
      - name: Close Issues if queue is healthy
        if: env.stale_found == 'false'
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Find all open watchdog issues for blank.yml. --limit is raised
          # because gh returns at most 30 issues by default.
          issues=$(gh issue list \
            --repo "$GITHUB_REPOSITORY" \
            --state open \
            --label watchdog \
            --search '"Queued job stuck" blank.yml in:title' \
            --limit 100 \
            --json number --jq '.[].number')

          if [ -z "$issues" ]; then
            echo "No open watchdog issues found."
            exit 0
          fi

          while IFS= read -r issue_number; do
            [ -n "$issue_number" ] || continue
            echo "Closing Issue #$issue_number"
            gh issue close "$issue_number" --repo "$GITHUB_REPOSITORY" \
              --comment "✅ Corresponding workflow run is no longer queued. Closing watchdog alert."
          done <<< "$issues"