diff --git a/.github/workflows/check-workflow-run.yml b/.github/workflows/check-workflow-run.yml
new file mode 100644
index 00000000000..7c3cfc9c76f
--- /dev/null
+++ b/.github/workflows/check-workflow-run.yml
@@ -0,0 +1,41 @@
+---
+name: Check Workflow Run
+
+on:
+  workflow_run:
+    types:
+      - completed
+    workflows:
+      - CI
+      - Nightly
+      - Scheduled
+
+permissions:
+  actions: write
+
+jobs:
+  restart-failed-workflow-runs:
+    name: "Restart Workflow (ID: ${{ github.event.workflow_run.id }}; Attempt: ${{ github.event.workflow_run.run_attempt }})"
+    if: ${{ github.event.workflow_run.conclusion == 'failure' }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v4
+        id: setup-python
+        with:
+          python-version: "3.10"
+
+      - name: Setup Python Tools Scripts
+        uses: ./.github/actions/setup-python-tools-scripts
+
+      - name: Pretty Print The GH Actions Event
+        run:
+          tools ci print-gh-event
+
+      - name: Restart Workflow
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          tools ci rerun-workflow
diff --git a/.github/workflows/templates/check-workflow-run.yml.j2 b/.github/workflows/templates/check-workflow-run.yml.j2
new file mode 100644
index 00000000000..3c0ea2a8b59
--- /dev/null
+++ b/.github/workflows/templates/check-workflow-run.yml.j2
@@ -0,0 +1,45 @@
+{#- This workflow will restart failed workflow runs.
+    We should stop using this workflow once we remove the flakiness from
+    Salt's test suite
+-#}
+---
+name: <{ workflow_name }>
+
+on:
+  workflow_run:
+    types:
+      - completed
+    workflows:
+      <%- for workflow in check_workflows %>
+      - <{ workflow }>
+      <%- endfor %>
+
+permissions:
+  actions: write
+
+jobs:
+  restart-failed-workflow-runs:
+    name: "Restart Workflow (ID: ${{ github.event.workflow_run.id }}; Attempt: ${{ github.event.workflow_run.run_attempt }})"
+    if: ${{ github.event.workflow_run.conclusion == 'failure' }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v4
+        id: setup-python
+        with:
+          python-version: "3.10"
+
+      - name: Setup Python Tools Scripts
+        uses: ./.github/actions/setup-python-tools-scripts
+
+      - name: Pretty Print The GH Actions Event
+        run:
+          tools ci print-gh-event
+
+      - name: Restart Workflow
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          tools ci rerun-workflow
diff --git a/tools/ci.py b/tools/ci.py
index aeeb9effb68..96bbe0b2d3c 100644
--- a/tools/ci.py
+++ b/tools/ci.py
@@ -511,3 +511,93 @@ def transport_matrix(ctx: Context, distro_slug: str):
         _matrix.append({"transport": transport})
     print(json.dumps(_matrix))
     ctx.exit(0)
+
+
+@ci.command(
+    name="rerun-workflow",
+)
+def rerun_workflow(ctx: Context):
+    """
+    Re-run the failed jobs of a failed workflow run.
+
+    The ``workflow_run`` event payload is read from the file named by the
+    ``GITHUB_EVENT_PATH`` environment variable. The run is left alone once its
+    ``run_attempt`` reaches 3, or when more than half of its jobs failed.
+    """
+    gh_event_path = os.environ.get("GITHUB_EVENT_PATH") or None
+    if gh_event_path is None:
+        ctx.warn("The 'GITHUB_EVENT_PATH' variable is not set.")
+        ctx.exit(1)
+
+    if TYPE_CHECKING:
+        assert gh_event_path is not None
+
+    try:
+        with open(gh_event_path, encoding="utf-8") as rfh:
+            gh_event = json.load(rfh)
+    except Exception as exc:
+        ctx.error(f"Could not load the GH Event payload from {gh_event_path!r}:\n", exc)
+        ctx.exit(1)
+
+    workflow_run = gh_event["workflow_run"]
+    ctx.info(
+        f"Processing Workflow ID {workflow_run['id']}, attempt {workflow_run['run_attempt']}..."
+    )
+    if workflow_run["run_attempt"] >= 3:
+        ctx.info(
+            f"This workflow has failed for the past {workflow_run['run_attempt']} attempts. "
+            "Not re-running it."
+        )
+        ctx.exit(0)
+
+    run_id = str(workflow_run["id"])
+    repository = workflow_run["repository"]["full_name"]
+    page = 1
+    total = failed = 0
+    # Page through the jobs of the latest attempt to count how many failed
+    while True:
+        cmdline = [
+            "gh",
+            "api",
+            "-H",
+            "Accept: application/vnd.github+json",
+            f"/repos/{repository}/actions/runs/{run_id}/jobs?filter=latest&per_page=100&page={page}",
+        ]
+        ret = ctx.run(*cmdline, capture=True, check=False)
+        if ret.returncode:
+            ctx.error("Failed to get the jobs for the workflow run")
+            # NOTE(review): exits 0 on failure, presumably to keep this helper workflow green - confirm
+            ctx.exit(0)
+
+        jobs = json.loads(ret.stdout.strip().decode())["jobs"]
+        if not jobs:
+            # An empty page means there are no more jobs to count
+            break
+
+        for job in jobs:
+            total += 1
+            if job["conclusion"] == "failure":
+                failed += 1
+        page += 1
+
+    ctx.info(f"{failed} out of {total} jobs failed.")
+    if failed > total / 2:
+        ctx.info("More than half of the jobs failed. Not automatically restarting.")
+        ctx.exit(0)
+
+    cmdline = [
+        "gh",
+        "run",
+        "-R",
+        repository,
+        "rerun",
+        run_id,
+        "--failed",
+    ]
+    ctx.info(f"Running {' '.join(cmdline)!r} ...")
+    ret = ctx.run(*cmdline, check=False)
+    if ret.returncode:
+        ctx.error("Failed to re-run workflow")
+    else:
+        ctx.info("Restarted workflow successfully")
+    ctx.exit(0)
diff --git a/tools/pre_commit.py b/tools/pre_commit.py
index 909f29abaf7..b270c43f1c6 100644
--- a/tools/pre_commit.py
+++ b/tools/pre_commit.py
@@ -59,6 +59,9 @@ def generate_workflows(ctx: Context):
         "Scheduled": {
             "template": "scheduled.yml",
         },
+        "Check Workflow Run": {
+            "template": "check-workflow-run.yml",
+        },
     }
     env = Environment(
         block_start_string="<%",
@@ -88,6 +91,9 @@ def generate_workflows(ctx: Context):
             "conclusion_needs": NeedsTracker(),
             "test_salt_needs": NeedsTracker(),
         }
+        if workflow_name == "Check Workflow Run":
+            check_workflows = [wf for wf in sorted(workflows) if wf != workflow_name]
+            context["check_workflows"] = check_workflows
         loaded_template = env.get_template(f"{template}.j2")
         rendered_template = loaded_template.render(**context)
         workflow_path.write_text(rendered_template.rstrip() + "\n")