# llvm/.github/workflows/libcxx-restart-preempted-jobs.yaml

name: Restart Preempted Libc++ Workflow

# The libc++ builders run on preemptable VMs, which can be shut down at any time.
# This workflow identifies when a workflow run was canceled due to the VM being preempted,
# and restarts the workflow run.

# We identify a canceled workflow run by checking the annotations of the check runs in the check suite,
# which should contain the message "The runner has received a shutdown signal."

# Note: If a job is both preempted and also contains a non-preemption failure, we do not restart the workflow.

# Trigger: runs every time a run of the listed workflow completes. The
# conclusion of that run is not filtered here; see the `if:` on the job.
on:
  workflow_run:
    # NOTE(review): the `+` characters in the workflow name are
    # backslash-escaped; presumably the name is matched as a pattern in which
    # `+` is special — confirm against the GitHub Actions documentation.
    workflows: [Build and Test libc\+\+]
    types:
      - completed

# Workflow-level token permissions; the job below declares its own set.
permissions:
  contents: read

jobs:
  restart:
    # Only run when the repository owner is `llvm`, and only when the
    # triggering workflow run concluded with `failure` or `cancelled`.
    if: github.repository_owner == 'llvm' && (github.event.workflow_run.conclusion == 'failure' || github.event.workflow_run.conclusion == 'cancelled')
    name: "Restart Job"
    permissions:
      statuses: read
      # The script creates a check run and lists check runs / annotations.
      checks: write
      # The script re-runs the failed jobs of the triggering workflow run.
      actions: write
    runs-on: ubuntu-latest
    steps:
      - name: "Restart Job"
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
        with:
          # JavaScript body executed by actions/github-script. It relies on the
          # `github`, `context` and `core` objects and uses top-level `await`
          # and `return`.
          script: |
            // Payload of the workflow run whose completion triggered this script.
            const wf_run = context.payload.workflow_run

            // Annotation text treated as a genuine (non-preemption) failure.
            // NOTE(review): the trailing `.` is unescaped, so it matches any
            // character (e.g. "exit code 127" also matches) — confirm intent.
            const failure_regex = /Process completed with exit code 1./
            // Annotation text treated as evidence that the runner was preempted.
            const preemption_regex = /The runner has received a shutdown signal/

            // Record which workflow run is being inspected in the job summary.
            const run_description = `Running on "${wf_run.display_title}" by @${wf_run.actor.login} (event: ${wf_run.event})`
            core.notice(`${run_description}\nWorkflow run URL: ${wf_run.html_url}`)


            /**
             * Create a completed check run named 'Restart Preempted Job' on the
             * head commit of the triggering workflow run, recording whether the
             * workflow was restarted or not.
             *
             * @param {string} conclusion One of 'success', 'skipped' or 'neutral'.
             *   Any other value marks this action as failed and no check run is
             *   created.
             * @param {string} message Text used as the check run's output summary.
             */
            async function create_check_run(conclusion, message) {
                const allowed_conclusions = ['success', 'skipped', 'neutral']
                if (!allowed_conclusions.includes(conclusion)) {
                  core.setFailed('Invalid conclusion: ' + conclusion)
                  // Do not publish a check run carrying a conclusion this
                  // workflow never intends to report.
                  return
                }
                await github.rest.checks.create({
                    owner: context.repo.owner,
                    repo: context.repo.repo,
                    name: 'Restart Preempted Job',
                    head_sha: wf_run.head_sha,
                    status: 'completed',
                    conclusion: conclusion,
                    output: {
                      // NOTE(review): this title is used for every conclusion,
                      // including the ones where nothing was restarted.
                      title: 'Restarted Preempted Job',
                      summary: message
                    }
                })
            }

            // Collect the IDs of all completed check runs in the triggering
            // workflow run's check suite that ended in 'failure' or 'cancelled'.
            // `github.paginate` walks every result page, so suites with more
            // than 100 check runs are fully covered.
            console.log('Listing check runs for suite')
            const suite_check_runs = await github.paginate(github.rest.checks.listForSuite, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              check_suite_id: context.payload.workflow_run.check_suite_id,
              per_page: 100
            })

            // Declared with `const` so the names do not leak as implicit globals.
            const check_run_ids = [];
            for (const check_run of suite_check_runs) {
              console.log('Checking check run: ' + check_run.id);
              if (check_run.status != 'completed') {
                console.log('Check run was not completed. Skipping.');
                continue;
              }
              if (check_run.conclusion != 'failure' && check_run.conclusion != 'cancelled') {
                console.log('Check run had conclusion: ' + check_run.conclusion + '. Skipping.');
                continue;
              }
              check_run_ids.push(check_run.id);
            }

            has_preempted_job = false;

            for (check_run_id of check_run_ids) {
              console.log('Listing annotations for check run: ' + check_run_id);

              annotations = await github.rest.checks.listAnnotations({
                owner: context.repo.owner,
                repo: context.repo.repo,
                check_run_id: check_run_id
              })

              for (annotation of annotations.data) {
                if (annotation.annotation_level != 'failure') {
                  continue;
                }

                const preemption_match = annotation.message.match(preemption_regex);

                if (preemption_match != null) {
                  console.log('Found preemption message: ' + annotation.message);
                  has_preempted_job = true;
                }

                const failure_match = annotation.message.match(failure_regex);
                if (failure_match != null) {
                  // We only want to restart the workflow if all of the failures were due to preemption.
                  // We don't want to restart the workflow if there were other failures.
                  core.notice('Choosing not to rerun workflow because we found a non-preemption failure' +
                    'Failure message: "' + annotation.message + '"');
                  await create_check_run('skipped', 'Choosing not to rerun workflow because we found a non-preemption failure\n'
                    + 'Failure message: ' + annotation.message)
                  return;
                }
              }
            }

            if (!has_preempted_job) {
              core.notice('No preempted jobs found. Not restarting workflow.');
              await create_check_run('neutral', 'No preempted jobs found. Not restarting workflow.')
              return;
            }

            core.notice("Restarted workflow: " + context.payload.workflow_run.id);
            await github.rest.actions.reRunWorkflowFailedJobs({
                owner: context.repo.owner,
                repo: context.repo.repo,
                run_id: context.payload.workflow_run.id
              })
            await create_check_run('success', 'Restarted workflow run due to preempted job')