Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
a7e5e9b
Add gcov-based test pruning with file-level coverage cache
sbryngelson Mar 2, 2026
1087b9d
Fix post_process failures in coverage cache by applying output params
sbryngelson Mar 2, 2026
5c581f6
TEMP: strip ALWAYS_RUN_ALL to GPU macros only to exercise pruning in CI
sbryngelson Mar 2, 2026
28f2dc1
Fix review findings: phase labels, unused params, FileNotFoundError g…
sbryngelson Mar 2, 2026
f4b282a
Increase cache builder per-test timeout from 300s to 600s
sbryngelson Mar 2, 2026
3675860
Log failed test output in cache builder for easier debugging
sbryngelson Mar 2, 2026
ca70390
Fix Rich MarkupError crash when build output contains bracket paths
sbryngelson Mar 2, 2026
70ad8b8
Disable AVX-512 FP16 to fix build on Granite Rapids nodes
sbryngelson Mar 3, 2026
01aa768
Fix post_process failures in coverage cache builder
sbryngelson Mar 3, 2026
bb43adb
Fix Fortran namelist quoting bug and add .inp existence check
sbryngelson Mar 3, 2026
97cab67
Restore ALWAYS_RUN_ALL, unit tests, and remove duplicate use statement
sbryngelson Mar 3, 2026
c6a0071
Exclude example-based tests from coverage pruning
sbryngelson Mar 3, 2026
5c4eaab
Cap bench script parallelism at 64 to fix GNR node failures
sbryngelson Mar 3, 2026
16bbe6e
TEMP: exercise pruning in CI + fix missing sim coverage
sbryngelson Mar 3, 2026
b79f6c6
Pass GITHUB_EVENT_NAME to SLURM jobs for coverage pruning
sbryngelson Mar 4, 2026
187f3b8
Update coverage cache: 100% sim coverage with t_step_stop=1
sbryngelson Mar 4, 2026
3ab5bce
DIAG: add slug diagnostic logging to identify binary-not-found root c…
sbryngelson Mar 4, 2026
8a47aa3
Fix OOM in coverage cache: reduce test parallelism to 16 and remove d…
sbryngelson Mar 4, 2026
1d33a0a
Restore ALWAYS_RUN_ALL entries and remove TEMP Fortran use statement
sbryngelson Mar 4, 2026
8120009
Skip 1D_qbmm example test: formatted I/O field overflow on gfortran 12
sbryngelson Mar 4, 2026
160353b
Address PR review feedback: fix type annotation, dep detection, and e…
sbryngelson Mar 4, 2026
c4db356
Filter out build/staging paths from coverage cache
sbryngelson Mar 4, 2026
6d10fb7
Harden cache push target and add safety comments
sbryngelson Mar 4, 2026
66c792a
Reduce benchmark steps and switch Frontier bench to batch/normal QOS
sbryngelson Mar 5, 2026
b50e6ea
Use retry_build in coverage cache rebuild for NFS resilience
sbryngelson Mar 5, 2026
1bac876
Address PR review feedback: fix type annotation, dep detection, and e…
sbryngelson Mar 5, 2026
9719ea3
Fix coverage cache: remove SYSCHECK, short-circuit on failure, fix pr…
sbryngelson Mar 6, 2026
df97a8a
Clean up test output dirs after cache build to reduce NFS pressure
sbryngelson Mar 6, 2026
44e15ef
Remove persistent build cache for self-hosted test runners
sbryngelson Mar 6, 2026
a010a9a
Remove build cache from benchmark jobs on Phoenix and Frontier
sbryngelson Mar 6, 2026
2b52d57
Fix submit.sh to survive monitor SIGKILL by re-checking SLURM state
sbryngelson Mar 6, 2026
2caf95f
Extract monitor SIGKILL recovery into shared run_monitored_slurm_job.sh
sbryngelson Mar 6, 2026
baa49bf
Fix bench: use PR's submit.sh for master job to get SIGKILL recovery
sbryngelson Mar 6, 2026
a84cabc
Fix submit_and_monitor_bench.sh: define SCRIPT_DIR before use
sbryngelson Mar 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/file-filter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,6 @@ checkall: &checkall
- *tests
- *scripts
- *yml

cases_py:
- 'toolchain/mfc/test/cases.py'
3 changes: 2 additions & 1 deletion .github/scripts/retry-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
# Try normal cleanup; if it fails, escalate to cache nuke.
_retry_clean() {
local clean_cmd="$1"
if eval "$clean_cmd" 2>/dev/null; then
# shellcheck disable=SC2086 # word splitting is intentional here
if $clean_cmd 2>/dev/null; then
return 0
fi
echo " Normal cleanup failed."
Expand Down
2 changes: 1 addition & 1 deletion .github/scripts/run-tests-with-retry.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
PASSTHROUGH=""
for arg in "$@"; do
case "$arg" in
--test-all) PASSTHROUGH="$PASSTHROUGH --test-all" ;;
--test-all|--single|--debug|--gcov|--only-changes) PASSTHROUGH="$PASSTHROUGH $arg" ;;
esac
done

Expand Down
37 changes: 37 additions & 0 deletions .github/scripts/run_monitored_slurm_job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash
# Run monitor_slurm_job.sh and recover if the monitor is killed (e.g. SIGKILL
# from the runner OS) before the SLURM job completes. When the monitor exits
# non-zero, sacct is used to verify the job's actual final state; if the SLURM
# job succeeded we exit 0 so the CI step is not falsely marked as failed.
#
# Usage: run_monitored_slurm_job.sh <job_id> <output_file>
#
# Exit status:
#   0  the monitor succeeded, or it failed but sacct reports the job as
#      COMPLETED with exit code 0:0
#   1  bad usage, or the job did not complete successfully / could not be
#      verified through sacct

set -euo pipefail

if [ $# -ne 2 ]; then
    # Usage errors are diagnostics, so send them to stderr.
    echo "Usage: $0 <job_id> <output_file>" >&2
    exit 1
fi

job_id="$1"
output_file="$2"

# monitor_slurm_job.sh is resolved relative to this wrapper's own directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Capture the monitor's status explicitly so `set -e` does not abort before
# the recovery check below gets a chance to run.
monitor_exit=0
bash "$SCRIPT_DIR/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?

if [ "$monitor_exit" -ne 0 ]; then
    echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..."
    # Give the SLURM epilog time to finalize if the job just finished
    sleep 30

    # Ask sacct once for both fields so state and exit code describe the same
    # accounting record (-X: allocation only, -P: '|'-separated, -n: no header).
    # The output is captured on its own, without a pipeline: under `pipefail`
    # a `sacct | head ... || echo FALLBACK` construct inside $(...) can append
    # the fallback text to real output whenever any pipeline stage fails.
    sacct_out=$(sacct -j "$job_id" -n -X -P -o State,ExitCode 2>/dev/null) || sacct_out=""

    # Only the first record matters; split it into its two fields.
    first_record=${sacct_out%%$'\n'*}
    final_state=""
    final_exit=""
    IFS='|' read -r final_state final_exit _ <<<"$first_record" || true

    # Strip padding blanks; an empty reply (sacct failed or has no record
    # yet) is reported as UNKNOWN instead of a blank state.
    final_state=${final_state// /}
    final_exit=${final_exit// /}
    final_state=${final_state:-UNKNOWN}

    echo "Final SLURM state=$final_state exit=$final_exit"
    if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then
        echo "SLURM job $job_id completed successfully despite monitor failure — continuing."
    else
        echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)"
        exit 1
    fi
fi
12 changes: 9 additions & 3 deletions .github/scripts/submit_and_monitor_bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,18 @@ device="$2"
interface="$3"
cluster="$4"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
cd "$dir"

# Submit and monitor job (submit.sh auto-detects bench mode from script name)
bash .github/workflows/$cluster/submit.sh \
.github/workflows/$cluster/bench.sh "$device" "$interface"
# Always use the PR's submit.sh so both master and PR builds benefit from the
# run_monitored_slurm_job.sh SIGKILL recovery wrapper. The bench script is
# still resolved relative to the current directory (master/ or pr/) so the
# correct branch code is benchmarked. SLURM_SUBMIT_DIR ensures the job runs
# in the right directory regardless of which submit.sh is invoked.
PR_SUBMIT="${SCRIPT_DIR}/../workflows/${cluster}/submit.sh"
bash "$PR_SUBMIT" .github/workflows/$cluster/bench.sh "$device" "$interface"

# Verify the YAML output file was created
job_slug="bench-$device-$interface"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ jobs:
runs-on:
group: ${{ matrix.group }}
labels: ${{ matrix.labels }}
timeout-minutes: 480
timeout-minutes: 240
steps:
- name: Clone - PR
uses: actions/checkout@v4
Expand Down
5 changes: 4 additions & 1 deletion .github/workflows/frontier/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@

source .github/scripts/bench-preamble.sh

# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes.
n_jobs=$(( $(nproc) > 64 ? 64 : $(nproc) ))

if [ "$job_device" = "gpu" ]; then
./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
else
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
./mfc.sh bench --mem 1 -j $n_jobs -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
fi
5 changes: 1 addition & 4 deletions .github/workflows/frontier/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,7 @@ build_opts="$gpu_opts"

. ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c")

# Only set up build cache for test suite, not benchmarks
if [ "$run_bench" != "bench" ]; then
source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface"
fi
rm -rf build

source .github/scripts/retry-build.sh
if [ "$run_bench" == "bench" ]; then
Expand Down
19 changes: 6 additions & 13 deletions .github/workflows/frontier/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,10 @@ else
fi

# Select SBATCH params based on job type
if [ "$job_type" = "bench" ]; then
sbatch_account="#SBATCH -A ENG160"
sbatch_time="#SBATCH -t 05:59:00"
sbatch_partition="#SBATCH -p extended"
sbatch_extra=""
else
sbatch_account="#SBATCH -A CFD154"
sbatch_time="#SBATCH -t 01:59:00"
sbatch_partition="#SBATCH -p batch"
sbatch_extra="#SBATCH --qos=normal"
fi
sbatch_account="#SBATCH -A CFD154"
sbatch_time="#SBATCH -t 01:59:00"
sbatch_partition="#SBATCH -p batch"
sbatch_extra="#SBATCH --qos=normal"

shard_suffix=""
if [ -n "$4" ]; then
Expand Down Expand Up @@ -85,6 +78,7 @@ job_device="$2"
job_interface="$3"
job_shard="$4"
job_cluster="$cluster_name"
export GITHUB_EVENT_NAME="$GITHUB_EVENT_NAME"

. ./mfc.sh load -c $compiler_flag -m $([ "$2" = "gpu" ] && echo "g" || echo "c")

Expand All @@ -102,5 +96,4 @@ fi

echo "Submitted batch job $job_id"

# Use resilient monitoring instead of sbatch -W
bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"
bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file"
10 changes: 8 additions & 2 deletions .github/workflows/frontier/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@ if [ -n "$job_shard" ]; then
shard_opts="--shard $job_shard"
fi

# Only prune tests on PRs; master pushes must run the full suite.
prune_flag=""
if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then
prune_flag="--only-changes"
fi

if [ "$job_device" = "gpu" ]; then
rdma_opts=""
if [ "$job_cluster" = "frontier" ]; then
rdma_opts="--rdma-mpi"
fi
./mfc.sh test -v -a $rdma_opts --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c $job_cluster
./mfc.sh test -v -a $rdma_opts --max-attempts 3 $prune_flag -j $ngpus $device_opts $shard_opts -- -c $job_cluster
else
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu $shard_opts -- -c $job_cluster
./mfc.sh test -v -a --max-attempts 3 $prune_flag -j 32 --no-gpu $shard_opts -- -c $job_cluster
fi
10 changes: 8 additions & 2 deletions .github/workflows/phoenix/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

source .github/scripts/bench-preamble.sh

# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes
# (GNR nodes have 192 cores but nproc is too aggressive for build/bench).
n_jobs=$(( $(nproc) > 64 ? 64 : $(nproc) ))

tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
mkdir -p $tmpbuild
Expand All @@ -15,10 +19,12 @@ else
bench_opts="--mem 1"
fi

rm -rf build

source .github/scripts/retry-build.sh
RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $(nproc) $build_opts || exit 1
RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1

./mfc.sh bench $bench_opts -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
./mfc.sh bench $bench_opts -j $n_jobs -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks

sleep 10
rm -rf "$currentdir" || true
Expand Down
23 changes: 23 additions & 0 deletions .github/workflows/phoenix/rebuild-cache.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash
# Rebuild the per-test coverage cache: clean, build with gcov
# instrumentation, then run the test suite in cache-building mode.
set -e

# Upper bound on concurrent tests, to avoid overwhelming OpenMPI daemons and
# OS process limits with concurrent launches.
_max_parallel_tests=64

# Test parallelism follows the SLURM allocation when one is present and
# falls back to 24 otherwise; anything above the bound is clamped.
NJOBS="${SLURM_CPUS_ON_NODE:-24}"
if [ "$NJOBS" -gt "$_max_parallel_tests" ]; then
    NJOBS="$_max_parallel_tests"
fi

# Start from a clean tree: the self-hosted runner may still hold a cached
# GPU build (e.g. --gpu mp) whose CMake flags are incompatible with gcov.
./mfc.sh clean

# Bring retry_build() into scope; per the project notes it retries the build
# for resilience against NFS stale file handles.
. .github/scripts/retry-build.sh

# Instrumented build: MFC with gcov coverage (CPU-only, gfortran).
retry_build ./mfc.sh build --gcov -j 8

# Run the whole suite in parallel while gathering per-test coverage.
# Every test writes into its own GCOV_PREFIX directory so .gcda files never
# collide, and coverage is collected per-test once all tests have finished.
# --gcov must be repeated here so the internal build step keeps the
# instrumentation.
./mfc.sh test --build-coverage-cache --gcov -j "$NJOBS"
14 changes: 7 additions & 7 deletions .github/workflows/phoenix/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,22 @@ case "$script_basename" in
esac

sbatch_cpu_opts="\
#SBATCH -p cpu-small # partition
#SBATCH --ntasks-per-node=24 # Number of cores per node required
#SBATCH --mem-per-cpu=2G # Memory per core\
#SBATCH -p cpu-gnr # partition (full Granite Rapids node)
#SBATCH --exclusive # exclusive access to all cores
#SBATCH -C graniterapids # constrain to GNR architecture\
"

if [ "$job_type" = "bench" ]; then
sbatch_gpu_opts="\
#SBATCH -CL40S
#SBATCH --ntasks-per-node=4 # Number of cores per node required
#SBATCH --ntasks-per-node=4 # Number of MPI tasks per node required
#SBATCH -G2\
"
sbatch_time="#SBATCH -t 04:00:00"
else
sbatch_gpu_opts="\
#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
#SBATCH --ntasks-per-node=4 # Number of cores per node required
#SBATCH --ntasks-per-node=4 # Number of MPI tasks per node required
#SBATCH -G2\
"
sbatch_time="#SBATCH -t 03:00:00"
Expand Down Expand Up @@ -77,6 +77,7 @@ echo "Running in $(pwd):"
job_slug="$job_slug"
job_device="$2"
job_interface="$3"
export GITHUB_EVENT_NAME="$GITHUB_EVENT_NAME"

. ./mfc.sh load -c p -m $2

Expand All @@ -94,6 +95,5 @@ fi

echo "Submitted batch job $job_id"

# Use resilient monitoring instead of sbatch -W
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"
bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file"
15 changes: 11 additions & 4 deletions .github/workflows/phoenix/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,28 @@
source .github/scripts/gpu-opts.sh
build_opts="$gpu_opts"

# Set up persistent build cache
source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface"
rm -rf build

# Build with retry; smoke-test cached binaries to catch architecture mismatches
# (SIGILL from binaries compiled on a different compute node).
source .github/scripts/retry-build.sh
RETRY_VALIDATE_CMD='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' \
retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1

n_test_threads=8
# Use up to 64 parallel test threads on CPU (GNR nodes have 192 cores).
# Cap at 64 to avoid overwhelming OpenMPI daemons and OS process limits with concurrent launches.
n_test_threads=$(( SLURM_CPUS_ON_NODE > 64 ? 64 : ${SLURM_CPUS_ON_NODE:-8} ))

if [ "$job_device" = "gpu" ]; then
source .github/scripts/detect-gpus.sh
device_opts="-g $gpu_ids"
n_test_threads=$((ngpus * 2))
fi

./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix
# Only prune tests on PRs; master pushes must run the full suite.
prune_flag=""
if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then
prune_flag="--only-changes"
fi

./mfc.sh test -v --max-attempts 3 $prune_flag -a -j $n_test_threads $device_opts -- -c phoenix
Loading
Loading