Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
b9da01e
Work around CCE 19.0.0 compiler bugs for Cray+OpenACC builds
Mar 5, 2026
1aa4cf5
Temporarily disable Phoenix + Frontier AMD CI (pre-existing failures …
Mar 5, 2026
835a2b8
Address code review findings: bounds warning, assert patch applied, C…
Mar 5, 2026
ddcaa4a
Temporarily disable Phoenix (NVHPC) benchmark jobs (QOS job limit iss…
Mar 5, 2026
6314820
Address CodeRabbit review: CCE PROHIBIT guards + pyrometheus forward-…
Mar 5, 2026
c274109
Add comment noting pyrometheus upstream issue for thermochem GPU_ROUT…
Mar 5, 2026
05d1dc0
Merge branch 'master' into fix/cce-cray-inline-routine
sbryngelson Mar 5, 2026
1dadcc3
Fix Frontier benchmark SLURM: use batch+1:59+normal QOS
Mar 6, 2026
e208275
Fix bench.yml: restore timeout-minutes to 480 (revert accidental 240)
Mar 6, 2026
2d1b359
Address review: CCE_MAX_SPECIES constant, GPU error for n_species ove…
Mar 6, 2026
810056d
Fix ##-> #! Fypp comment in m_chemistry.fpp top-level scope
Mar 6, 2026
8a6398c
Fix ##-> #! Fypp comment in parallel_macros.fpp cray_noinline block
Mar 6, 2026
23309f6
Extend -Oipa0 workaround to all Cray builds, not just Cray+OpenACC
Mar 6, 2026
5d177b7
Fix -Oipa0 guard: exclude Cray+OpenMP, cover Cray+OpenACC and Cray CPU
Mar 6, 2026
9fc072a
Remove persistent build cache for self-hosted test runners
sbryngelson Mar 6, 2026
2cdade9
Remove build cache from benchmark jobs on Phoenix and Frontier
sbryngelson Mar 6, 2026
6e97695
Fix submit.sh to survive monitor SIGKILL by re-checking SLURM state
sbryngelson Mar 6, 2026
61924d8
Extract monitor SIGKILL recovery into shared run_monitored_slurm_job.sh
sbryngelson Mar 6, 2026
ac28127
bench: update Phoenix tmpbuild path to project storage
sbryngelson Mar 7, 2026
8db8807
Re-enable Phoenix NVHPC and Frontier AMD in CI workflows
sbryngelson Mar 7, 2026
4e6482d
Remove Phoenix cpu from bench matrix
sbryngelson Mar 7, 2026
878fddb
Remove CCE VLA guard from m_chemistry.fpp; slim CI to Frontier (CCE) …
Mar 8, 2026
86f864d
Restore GitHub, Phoenix, and Frontier AMD CI/bench jobs
Mar 8, 2026
c17653f
toolchain: log warning when pyrometheus CCE workaround patch applies
Mar 8, 2026
24ea0cb
ci: restore diff-based Intel oneAPI env export to avoid GITHUB_ENV co…
Mar 8, 2026
f881888
Restore full case-optimization matrix and Frontier AMD sharding
Mar 8, 2026
fd06e97
Adopt CI improvements from PR #1295
Mar 8, 2026
917cdd5
Re-trigger CI
Mar 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions .github/scripts/run_monitored_slurm_job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash
# Run monitor_slurm_job.sh and recover if the monitor is killed (e.g. SIGKILL
# from the runner OS) before the SLURM job completes. When the monitor exits
# non-zero, sacct is used to verify the job's actual final state; if the SLURM
# job succeeded we exit 0 so the CI step is not falsely marked as failed.
#
# Usage: run_monitored_slurm_job.sh <job_id> <output_file>
#
# Exit status:
#   0  the monitor exited 0, or sacct reports State=COMPLETED and ExitCode=0:0
#   1  wrong number of arguments, or the job could not be confirmed successful

set -euo pipefail

if [ $# -ne 2 ]; then
    echo "Usage: $0 <job_id> <output_file>" >&2
    exit 1
fi

job_id="$1"
output_file="$2"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Print a single sacct accounting field for a job's allocation record.
# Arguments: $1 - SLURM job id, $2 - sacct field name (e.g. State, ExitCode)
# Outputs:   the field value with spaces removed; empty if sacct fails or
#            has no record for the job.
#
# The sacct output is captured in full before it is parsed. Piping sacct
# straight into `head -n1` under `set -o pipefail` can make the pipeline report
# failure (SIGPIPE) even though valid output was produced, and an in-line
# `|| echo FALLBACK` would then be appended to that output instead of
# replacing it.
sacct_field() {
    local id="$1"
    local field="$2"
    local raw

    # -n: no header, -X: allocation record only (no job steps), -P: parsable2.
    # stderr is discarded on purpose: a failed query is reported as an empty value.
    raw=$(sacct -j "$id" -n -X -P -o "$field" 2>/dev/null) || raw=""

    raw=${raw%%$'\n'*}  # keep the first line only
    raw=${raw%%|*}      # keep the first '|'-separated column only
    raw=${raw// /}      # drop spaces
    printf '%s' "$raw"
}

# Capture the monitor's status without tripping `set -e`.
monitor_exit=0
bash "$SCRIPT_DIR/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?

if [ "$monitor_exit" -ne 0 ]; then
    echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..."
    # Give the SLURM epilog time to finalize if the job just finished
    sleep 30

    final_state=$(sacct_field "$job_id" State)
    final_state=${final_state:-UNKNOWN}
    final_exit=$(sacct_field "$job_id" ExitCode)

    echo "Final SLURM state=$final_state exit=$final_exit"
    if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then
        echo "SLURM job $job_id completed successfully despite monitor failure — continuing."
    else
        echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)" >&2
        exit 1
    fi
fi
5 changes: 1 addition & 4 deletions .github/workflows/frontier/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,7 @@ build_opts="$gpu_opts"

. ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c")

# Only set up build cache for test suite, not benchmarks
if [ "$run_bench" != "bench" ]; then
source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface"
fi
rm -rf build

source .github/scripts/retry-build.sh
if [ "$run_bench" == "bench" ]; then
Expand Down
11 changes: 5 additions & 6 deletions .github/workflows/frontier/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,10 @@ fi

# Select SBATCH params based on job type
if [ "$job_type" = "bench" ]; then
sbatch_account="#SBATCH -A ENG160"
sbatch_time="#SBATCH -t 05:59:00"
sbatch_partition="#SBATCH -p extended"
sbatch_extra=""
sbatch_account="#SBATCH -A CFD154"
sbatch_time="#SBATCH -t 01:59:00"
sbatch_partition="#SBATCH -p batch"
sbatch_extra="#SBATCH --qos=normal"
else
sbatch_account="#SBATCH -A CFD154"
sbatch_time="#SBATCH -t 01:59:00"
Expand Down Expand Up @@ -102,5 +102,4 @@ fi

echo "Submitted batch job $job_id"

# Use resilient monitoring instead of sbatch -W
bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"
bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file"
4 changes: 3 additions & 1 deletion .github/workflows/phoenix/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

source .github/scripts/bench-preamble.sh

tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
mkdir -p $tmpbuild
mkdir -p $currentdir
Expand All @@ -15,6 +15,8 @@ else
bench_opts="--mem 1"
fi

rm -rf build

source .github/scripts/retry-build.sh
RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $(nproc) $build_opts || exit 1

Expand Down
3 changes: 1 addition & 2 deletions .github/workflows/phoenix/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,5 @@ fi

echo "Submitted batch job $job_id"

# Use resilient monitoring instead of sbatch -W
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"
bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file"
3 changes: 1 addition & 2 deletions .github/workflows/phoenix/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
source .github/scripts/gpu-opts.sh
build_opts="$gpu_opts"

# Set up persistent build cache
source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface"
rm -rf build

# Build with retry; smoke-test cached binaries to catch architecture mismatches
# (SIGILL from binaries compiled on a different compute node).
Expand Down
66 changes: 36 additions & 30 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,12 @@ jobs:
- name: Clone
uses: actions/checkout@v4

- name: Restore Build Cache
uses: actions/cache@v4
with:
path: build
key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}

- name: Setup MacOS
if: matrix.os == 'macos'
run: |
Expand Down Expand Up @@ -131,32 +137,20 @@ jobs:
printenv | sort > /tmp/env_after
diff /tmp/env_before /tmp/env_after | grep '^>' | sed 's/^> //' >> $GITHUB_ENV

- name: Get system info for cache key
id: sys-info
run: |
{
uname -m
cat /proc/cpuinfo 2>/dev/null | grep 'model name' | head -1 || sysctl -n machdep.cpu.brand_string 2>/dev/null || true
if command -v ifx &>/dev/null; then ifx --version 2>/dev/null | head -1; else ${FC:-gfortran} --version 2>/dev/null | head -1 || true; fi
${CC:-gcc} --version 2>/dev/null | head -1 || true
} | (sha256sum 2>/dev/null || shasum -a 256) | cut -c1-16 > /tmp/sys-hash
echo "sys-hash=$(cat /tmp/sys-hash)" >> "$GITHUB_OUTPUT"

- name: Restore Build Cache
uses: actions/cache@v4
- name: Set up Python 3.14
uses: actions/setup-python@v5
with:
path: build
key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ steps.sys-info.outputs.sys-hash }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}
python-version: '3.14'

- name: Build
run: |
/bin/bash mfc.sh test -v --dry-run -j "$(nproc)" --${{ matrix.debug }} --${{ matrix.mpi }} $PRECISION $TEST_ALL
/bin/bash mfc.sh test -v --dry-run -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL
env:
TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
PRECISION: ${{ matrix.precision != '' && format('--{0}', matrix.precision) || '' }}

- name: Test
run: bash .github/scripts/run-tests-with-retry.sh -v --max-attempts 3 -j "$(nproc)" $TEST_ALL $TEST_PCT
run: |
/bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT
env:
TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }}
Expand Down Expand Up @@ -186,7 +180,7 @@ jobs:
cluster_name: 'Georgia Tech | Phoenix'
device: 'cpu'
interface: 'none'
# Frontier (ORNL) — build on login node, GPU tests sharded for batch partition
# Frontier (ORNL) — CCE
- runner: 'frontier'
cluster: 'frontier'
cluster_name: 'Oak Ridge | Frontier'
Expand Down Expand Up @@ -243,21 +237,30 @@ jobs:
- name: Clone
uses: actions/checkout@v4
with:
clean: false
clean: true

- name: Build
if: matrix.cluster != 'phoenix'
uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3
with:
max_attempts: 3
max_attempts: 2
retry_wait_seconds: 60
timeout_minutes: 60
command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
on_retry_command: ./mfc.sh clean
on_retry_command: rm -rf build

- name: Test
run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }}

- name: Cancel SLURM Jobs
if: cancelled()
run: |
find . -name "*.slurm_job_id" | while read -r f; do
job_id=$(cat "$f")
echo "Cancelling SLURM job $job_id"
scancel "$job_id" 2>/dev/null || true
done

- name: Compute Log Slug
if: always()
id: log
Expand Down Expand Up @@ -321,25 +324,28 @@ jobs:
- name: Clone
uses: actions/checkout@v4
with:
clean: false
clean: true

- name: Pre-Build (SLURM)
if: matrix.cluster == 'phoenix'
run: bash .github/workflows/phoenix/submit.sh .github/scripts/prebuild-case-optimization.sh ${{ matrix.device }} ${{ matrix.interface }}

- name: Pre-Build (login node)
if: matrix.cluster != 'phoenix'
uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3
with:
max_attempts: 3
retry_wait_seconds: 60
timeout_minutes: 120
command: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }}
on_retry_command: ./mfc.sh clean
run: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }}

- name: Run Case-Optimization Tests
run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }}

- name: Cancel SLURM Jobs
if: cancelled()
run: |
find . -name "*.slurm_job_id" | while read -r f; do
job_id=$(cat "$f")
echo "Cancelling SLURM job $job_id"
scancel "$job_id" 2>/dev/null || true
done

- name: Print Logs
if: always()
run: |
Expand Down
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -105,4 +105,8 @@ benchmarks/*.png
*.avi

**isolation_rules/
**.supercode/
**.supercode/
# CCE stress-test log directories (local testing artifacts)
cce_*/
cce_*.log
run_cce_*.sh
43 changes: 36 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -224,13 +224,24 @@ endif()

if (CMAKE_BUILD_TYPE STREQUAL "Release")
# Processor tuning: Check if we can target the host's native CPU's ISA.
CHECK_FORTRAN_COMPILER_FLAG("-march=native" SUPPORTS_MARCH_NATIVE)
if (SUPPORTS_MARCH_NATIVE)
add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-march=native>)
else()
CHECK_FORTRAN_COMPILER_FLAG("-mcpu=native" SUPPORTS_MCPU_NATIVE)
if (SUPPORTS_MCPU_NATIVE)
add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mcpu=native>)
# Skip for gcov builds — -march=native on newer CPUs (e.g. Granite Rapids)
# can emit instructions the system assembler doesn't support.
if (NOT MFC_GCov)
CHECK_FORTRAN_COMPILER_FLAG("-march=native" SUPPORTS_MARCH_NATIVE)
if (SUPPORTS_MARCH_NATIVE)
add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-march=native>)
# Disable AVX-512 FP16: gfortran >=12 emits vmovw instructions on
# Granite Rapids CPUs, but binutils <2.38 cannot assemble them.
# FP16 is unused in MFC's double-precision computations.
CHECK_FORTRAN_COMPILER_FLAG("-mno-avx512fp16" SUPPORTS_MNO_AVX512FP16)
if (SUPPORTS_MNO_AVX512FP16)
add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mno-avx512fp16>)
endif()
else()
CHECK_FORTRAN_COMPILER_FLAG("-mcpu=native" SUPPORTS_MCPU_NATIVE)
if (SUPPORTS_MCPU_NATIVE)
add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mcpu=native>)
endif()
endif()
endif()

Expand Down Expand Up @@ -397,6 +408,7 @@ HANDLE_SOURCES(simulation ON)
HANDLE_SOURCES(post_process ON)
HANDLE_SOURCES(syscheck OFF)


# MFC_SETUP_TARGET: Given a target (herein <target>), this macro creates a new
# executable <target> with the appropriate sources, compiler definitions, and
# linked libraries (assuming HANDLE_SOURCES was called on <target>).
Expand Down Expand Up @@ -633,6 +645,23 @@ if (MFC_SIMULATION)
MFC_SETUP_TARGET(TARGET simulation
SOURCES "${simulation_SRCs}"
MPI FFTW OpenACC OpenMP)
# CCE 19.0.0 IPA workaround: two files trigger IPA crashes:
# m_bubbles_EL: castIsValid assertion (InstCombine/foldIntegerTypedPHI)
# m_phase_change: bring_routine_resident SIGSEGV
# Disabling IPA per-file avoids the crashes while preserving IPA for
# the rest of simulation (needed for thermochem INLINEALWAYS inlining).
# Applied to Cray+OpenACC and Cray CPU, but NOT Cray+OpenMP: on OpenMP,
# m_thermochem uses !DIR$ INLINEALWAYS (requires IPA), so disabling IPA
# for these files breaks thermochem on-device calls. On OpenACC the
# pyrometheus patch emits !$acc routine seq instead (no IPA needed).
# See PR #1286.
if (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray" AND NOT MFC_OpenMP)
set_source_files_properties(
"${CMAKE_BINARY_DIR}/fypp/simulation/m_bubbles_EL.fpp.f90"
"${CMAKE_BINARY_DIR}/fypp/simulation/m_phase_change.fpp.f90"
PROPERTIES COMPILE_OPTIONS "-Oipa0"
)
endif()
endif()

if (MFC_POST_PROCESS)
Expand Down
32 changes: 30 additions & 2 deletions src/common/include/parallel_macros.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,46 @@

#:enddef

#:def GPU_ROUTINE(function_name=None, parallelism=None, nohost=False, cray_inline=False, extraAccArgs=None, extraOmpArgs=None)
#:def GPU_ROUTINE(function_name=None, parallelism=None, nohost=False, cray_inline=False, cray_noinline=False, extraAccArgs=None, extraOmpArgs=None)
#:assert isinstance(cray_inline, bool)
#:assert isinstance(cray_noinline, bool)
#:assert not (cray_inline and cray_noinline), "cray_inline and cray_noinline are mutually exclusive"
#:set acc_directive = ACC_ROUTINE(function_name=function_name, parallelism=parallelism, nohost=nohost, extraAccArgs=extraAccArgs)
#:set omp_directive = OMP_ROUTINE(function_name=function_name, nohost=nohost, extraOmpArgs=extraOmpArgs)

#:if cray_inline == True
#:if cray_noinline == True
#:if not isinstance(function_name, str)
#:stop "When using cray_noinline, function name must be given and given as a string"
#:endif
#:set cray_noinline_directive = ('!DIR$ NOINLINE ' + function_name).strip('\n')
#ifdef _CRAYFTN
#if MFC_OpenACC
$:acc_directive
#elif MFC_OpenMP
$:omp_directive
#else
$:cray_noinline_directive
#endif
#! On non-Cray CPU builds (no _CRAYFTN, no MFC_OpenACC, no MFC_OpenMP), nothing is
#! emitted — intentional, since !DIR$ NOINLINE is a Cray-specific directive.
#elif MFC_OpenACC
$:acc_directive
#elif MFC_OpenMP
$:omp_directive
#endif
#:elif cray_inline == True
#:if not isinstance(function_name, str)
#:stop "When inlining for Cray Compiler, function name must be given and given as a string"
#:endif
#:set cray_directive = ('!DIR$ INLINEALWAYS ' + function_name).strip('\n')
#ifdef _CRAYFTN
#if MFC_OpenACC
$:acc_directive
#elif MFC_OpenMP
$:omp_directive
#else
$:cray_directive
#endif
#elif MFC_OpenACC
$:acc_directive
#elif MFC_OpenMP
Expand Down
Loading
Loading