MFlowCode · sbryngelson · Mar 2, 2026 · Mar 2, 2026 · Mar 2, 2026 · Mar 2, 2026
@@ -37,3 +37,6 @@ checkall: &checkall
   - *tests
   - *scripts
   - *yml
+
+cases_py:
+  - 'toolchain/mfc/test/cases.py'
@@ -8,7 +8,8 @@
 # Try normal cleanup; if it fails, escalate to cache nuke.
 _retry_clean() {
     local clean_cmd="$1"
-    if eval "$clean_cmd" 2>/dev/null; then
+    # shellcheck disable=SC2086  # word splitting is intentional here
+    if $clean_cmd 2>/dev/null; then
         return 0
     fi
     echo "  Normal cleanup failed."

@@ -8,7 +8,7 @@
 PASSTHROUGH=""
 for arg in "$@"; do
     case "$arg" in
-        --test-all) PASSTHROUGH="$PASSTHROUGH --test-all" ;;
+        --test-all|--single|--debug|--gcov|--only-changes) PASSTHROUGH="$PASSTHROUGH $arg" ;;
     esac
 done
 

@@ -0,0 +1,53 @@
+#!/bin/bash
+# Run monitor_slurm_job.sh and recover if the monitor is killed (e.g. SIGKILL
+# from the runner OS) before the SLURM job completes.  When the monitor exits
+# non-zero, sacct is used to verify the job's actual final state; if the SLURM
+# job succeeded we exit 0 so the CI step is not falsely marked as failed.
+#
+# Usage: run_monitored_slurm_job.sh <job_id> <output_file>
+#
+# Exit status:
+#   0  the monitor succeeded, or it failed but sacct reports the job as
+#      COMPLETED with exit code 0:0
+#   1  wrong argument count, or the job could not be confirmed successful
+
+set -euo pipefail
+
+if [ $# -ne 2 ]; then
+    echo "Usage: $0 <job_id> <output_file>" >&2
+    exit 1
+fi
+
+job_id="$1"
+output_file="$2"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Capture the monitor's exit code instead of letting `set -e` abort here.
+monitor_exit=0
+bash "$SCRIPT_DIR/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
+
+if [ "$monitor_exit" -ne 0 ]; then
+    echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..."
+    # Give the SLURM epilog time to finalize if the job just finished
+    sleep 30
+    # Fetch State and ExitCode from the same allocation record (-X) in a single
+    # sacct call.  `|| true` keeps errexit/pipefail from aborting the script
+    # when sacct fails; the fallback below then covers the empty result.
+    job_record=$(sacct -j "$job_id" -n -X -P -o State,ExitCode 2>/dev/null | head -n1 || true)
+    final_state=""
+    final_exit=""
+    IFS='|' read -r final_state final_exit <<< "$job_record" || true
+    # Drop any embedded blanks so the comparisons below are exact.
+    final_state="${final_state// /}"
+    final_exit="${final_exit// /}"
+    # An empty state means sacct failed or has no record for this job.
+    final_state="${final_state:-UNKNOWN}"
+    echo "Final SLURM state=$final_state exit=$final_exit"
+    if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then
+        echo "SLURM job $job_id completed successfully despite monitor failure — continuing."
+    else
+        echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)"
+        exit 1
+    fi
+fi
@@ -14,12 +14,18 @@ device="$2"
 interface="$3"
 cluster="$4"
 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
 echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
 cd "$dir"
 
-# Submit and monitor job (submit.sh auto-detects bench mode from script name)
-bash .github/workflows/$cluster/submit.sh \
-    .github/workflows/$cluster/bench.sh "$device" "$interface"
+# Always use the PR's submit.sh so both master and PR builds benefit from the
+# run_monitored_slurm_job.sh SIGKILL recovery wrapper.  The bench script is
+# still resolved relative to the current directory (master/ or pr/) so the
+# correct branch code is benchmarked.  SLURM_SUBMIT_DIR ensures the job runs
+# in the right directory regardless of which submit.sh is invoked.
+PR_SUBMIT="${SCRIPT_DIR}/../workflows/${cluster}/submit.sh"
+bash "$PR_SUBMIT" ".github/workflows/$cluster/bench.sh" "$device" "$interface"
 
 # Verify the YAML output file was created
 job_slug="bench-$device-$interface"

@@ -88,7 +88,7 @@ jobs:
     runs-on:
       group: ${{ matrix.group }}
       labels: ${{ matrix.labels }}
-    timeout-minutes: 480
+    timeout-minutes: 240
     steps:
       - name: Clone - PR
         uses: actions/checkout@v4

@@ -2,8 +2,11 @@
 
 source .github/scripts/bench-preamble.sh
 
+# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes.
+n_jobs=$(( $(nproc) > 64 ? 64 : $(nproc) ))
+
 if [ "$job_device" = "gpu" ]; then
     ./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
 else
-    ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
+    ./mfc.sh bench --mem 1 -j $n_jobs -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
 fi
@@ -20,10 +20,7 @@ build_opts="$gpu_opts"
 
 . ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c")
 
-# Only set up build cache for test suite, not benchmarks
-if [ "$run_bench" != "bench" ]; then
-    source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface"
-fi
+rm -rf build
 
 source .github/scripts/retry-build.sh
 if [ "$run_bench" == "bench" ]; then

@@ -44,17 +44,10 @@ else
 fi
 
 # Select SBATCH params based on job type
-if [ "$job_type" = "bench" ]; then
-    sbatch_account="#SBATCH -A ENG160"
-    sbatch_time="#SBATCH -t 05:59:00"
-    sbatch_partition="#SBATCH -p extended"
-    sbatch_extra=""
-else
-    sbatch_account="#SBATCH -A CFD154"
-    sbatch_time="#SBATCH -t 01:59:00"
-    sbatch_partition="#SBATCH -p batch"
-    sbatch_extra="#SBATCH --qos=normal"
-fi
+sbatch_account="#SBATCH -A CFD154"
+sbatch_time="#SBATCH -t 01:59:00"
+sbatch_partition="#SBATCH -p batch"
+sbatch_extra="#SBATCH --qos=normal"
 
 shard_suffix=""
 if [ -n "$4" ]; then
@@ -85,6 +78,7 @@ job_device="$2"
 job_interface="$3"
 job_shard="$4"
 job_cluster="$cluster_name"
+export GITHUB_EVENT_NAME="$GITHUB_EVENT_NAME"
 
 . ./mfc.sh load -c $compiler_flag -m $([ "$2" = "gpu" ] && echo "g" || echo "c")
 
@@ -102,5 +96,4 @@ fi
 
 echo "Submitted batch job $job_id"
 
-# Use resilient monitoring instead of sbatch -W
-bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"
+bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file"
@@ -9,12 +9,18 @@ if [ -n "$job_shard" ]; then
     shard_opts="--shard $job_shard"
 fi
 
+# Only prune tests on PRs; master pushes must run the full suite.
+prune_flag=""
+if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then
+    prune_flag="--only-changes"
+fi
+
 if [ "$job_device" = "gpu" ]; then
     rdma_opts=""
     if [ "$job_cluster" = "frontier" ]; then
         rdma_opts="--rdma-mpi"
     fi
-    ./mfc.sh test -v -a $rdma_opts --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c $job_cluster
+    ./mfc.sh test -v -a $rdma_opts --max-attempts 3 $prune_flag -j $ngpus $device_opts $shard_opts -- -c $job_cluster
 else
-    ./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu $shard_opts -- -c $job_cluster
+    ./mfc.sh test -v -a --max-attempts 3 $prune_flag -j 32 --no-gpu $shard_opts -- -c $job_cluster
 fi
@@ -2,6 +2,10 @@
 
 source .github/scripts/bench-preamble.sh
 
+# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes
+# (GNR nodes have 192 cores but nproc is too aggressive for build/bench).
+n_jobs=$(( $(nproc) > 64 ? 64 : $(nproc) ))
+
 tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
 currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
 mkdir -p $tmpbuild
@@ -15,10 +19,12 @@ else
     bench_opts="--mem 1"
 fi
 
+rm -rf build
+
 source .github/scripts/retry-build.sh
-RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $(nproc) $build_opts || exit 1
+RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
 
-./mfc.sh bench $bench_opts -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
+./mfc.sh bench $bench_opts -j $n_jobs -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
 
 sleep 10
 rm -rf "$currentdir" || true

@@ -0,0 +1,26 @@
+#!/bin/bash
+set -e
+
+# Parallel test jobs: take the SLURM allocation (SLURM_CPUS_ON_NODE) when it
+# is available, otherwise default to 24.  The value is clamped to 64 so that
+# concurrent launches do not overwhelm OpenMPI daemons or OS process limits.
+NJOBS="${SLURM_CPUS_ON_NODE:-24}"
+if [ "$NJOBS" -gt 64 ]; then
+    NJOBS=64
+fi
+
+# Start from a clean tree: the self-hosted runner may still hold a cached
+# GPU build (e.g. --gpu mp) whose CMake flags are incompatible with gcov.
+./mfc.sh clean
+
+# Bring retry_build() into scope (NFS stale file handle resilience, 3 attempts).
+source .github/scripts/retry-build.sh
+
+# Build MFC with gcov coverage instrumentation (CPU-only, gfortran).
+retry_build ./mfc.sh build --gcov -j 8
+
+# Run every test in parallel while gathering per-test coverage data.
+# Each test writes to its own GCOV_PREFIX directory so .gcda files cannot
+# collide; coverage is collected per test once all tests have finished.
+# --gcov must be passed so the internal build step keeps the instrumentation.
+./mfc.sh test --build-coverage-cache --gcov -j "$NJOBS"
@@ -24,22 +24,22 @@ case "$script_basename" in
 esac
 
 sbatch_cpu_opts="\
-#SBATCH -p cpu-small               # partition
-#SBATCH --ntasks-per-node=24       # Number of cores per node required
-#SBATCH --mem-per-cpu=2G           # Memory per core\
+#SBATCH -p cpu-gnr                 # partition (full Granite Rapids node)
+#SBATCH --exclusive                # exclusive access to all cores
+#SBATCH -C graniterapids           # constrain to GNR architecture\
 "
 
 if [ "$job_type" = "bench" ]; then
     sbatch_gpu_opts="\
 #SBATCH -CL40S
-#SBATCH --ntasks-per-node=4       # Number of cores per node required
+#SBATCH --ntasks-per-node=4       # Number of MPI tasks per node required
 #SBATCH -G2\
 "
     sbatch_time="#SBATCH -t 04:00:00"
 else
     sbatch_gpu_opts="\
 #SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
-#SBATCH --ntasks-per-node=4       # Number of cores per node required
+#SBATCH --ntasks-per-node=4       # Number of MPI tasks per node required
 #SBATCH -G2\
 "
     sbatch_time="#SBATCH -t 03:00:00"
@@ -77,6 +77,7 @@ echo "Running in $(pwd):"
 job_slug="$job_slug"
 job_device="$2"
 job_interface="$3"
+export GITHUB_EVENT_NAME="$GITHUB_EVENT_NAME"
 
 . ./mfc.sh load -c p -m $2
 
@@ -94,6 +95,5 @@ fi
 
 echo "Submitted batch job $job_id"
 
-# Use resilient monitoring instead of sbatch -W
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"
+bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file"
@@ -3,21 +3,28 @@
 source .github/scripts/gpu-opts.sh
 build_opts="$gpu_opts"
 
-# Set up persistent build cache
-source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface"
+rm -rf build
 
 # Build with retry; smoke-test cached binaries to catch architecture mismatches
 # (SIGILL from binaries compiled on a different compute node).
 source .github/scripts/retry-build.sh
 RETRY_VALIDATE_CMD='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' \
     retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1
 
-n_test_threads=8
+# Use up to 64 parallel test threads on CPU (GNR nodes have 192 cores); fall back to 8 when SLURM_CPUS_ON_NODE is unset or empty.
+# Cap at 64 to avoid overwhelming OpenMPI daemons and OS process limits with concurrent launches.
+n_test_threads=$(( ${SLURM_CPUS_ON_NODE:-8} > 64 ? 64 : ${SLURM_CPUS_ON_NODE:-8} ))
 
 if [ "$job_device" = "gpu" ]; then
     source .github/scripts/detect-gpus.sh
     device_opts="-g $gpu_ids"
     n_test_threads=$((ngpus * 2))
 fi
 
-./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix
+# Only prune tests on PRs; master pushes must run the full suite.
+prune_flag=""
+if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then
+    prune_flag="--only-changes"
+fi
+
+./mfc.sh test -v --max-attempts 3 $prune_flag -a -j $n_test_threads $device_opts -- -c phoenix