# Skip to content
#
# Nightly GPU Benchmarks #41

# Copyright 2025 The OpenXLA Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Workflow identity, token permissions and triggers.
name: Nightly GPU Benchmarks

# Restrict the workflow token to read-only access to repository contents.
permissions:
  contents: read

on:
  workflow_dispatch:  # Allows manual triggering
  schedule:
    # Run at midnight every day. GitHub evaluates cron schedules in UTC.
    - cron: "0 0 * * *"
jobs:
  # Builds XLA with the CUDA backend on each GPU runner listed in the matrix,
  # runs a fixed set of HLO modules through the HLO runner, and publishes the
  # text reports (to GCS) and the XSpace profiles (as workflow artifacts).
  Tests:
    strategy:
      # Don't fail fast - want to see results for all builds even if one fails.
      fail-fast: false
      matrix:
        # One entry per runner. `os` is the runner label, `container` the image
        # the job runs in, and `pretty_name` is both the displayed job name and
        # (with a "_github_actions" suffix) the --build value given to build.py.
        job_info:
          - os: "linux-x86-g2-48-l4-4gpu"
            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest"
            pretty_name: "XLA Linux x86 GPU L4 48 vcpu Presubmit"
          - os: "linux-x86-g2-16-l4-1gpu"
            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest"
            pretty_name: "XLA Linux x86 GPU L4 16 vcpu Presubmit"
          - os: "linux-x86-a4-224-b200-1gpu"
            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build-cuda12.8-cudnn9.8:latest"
            pretty_name: "XLA Linux x86 GPU A4 224 vcpu Presubmit"
          # Expect more GPU types in the future.
    name: ${{ matrix.job_info.pretty_name }}
    runs-on: ${{ matrix.job_info.os }}
    container: ${{ matrix.job_info.container }}
    defaults:
      run:
        shell: bash
    timeout-minutes: 240
    env:
      # Directory that receives the text reports and XSpace dumps.
      OUTPUT_DIR: ${{ github.workspace }}/output
    steps:
      - name: Checkout XLA
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11  # v4.1.1

      - name: Download Gemma Hlo Files
        # Fetch the HLO modules directly from the GCS bucket's public HTTPS
        # endpoint into tmp_hlo/.
        run: |
          mkdir -p tmp_hlo
          cd tmp_hlo
          wget https://storage.googleapis.com/xla-benchmarking-temp/gemma2_2b_keras_jax.hlo
          wget https://storage.googleapis.com/xla-benchmarking-temp/gemma3_1b_flax_call.hlo
          wget https://storage.googleapis.com/xla-benchmarking-temp/gemma3_1b_flax_sample_loop.hlo
          cd ..

      - name: Configure GPU backend
        run: |
          ./configure.py --backend=CUDA --cuda_compiler=nvcc

      - name: "Run build.py"
        run: |
          ./build_tools/ci/build.py --build="${{ matrix.job_info.pretty_name }}_github_actions"

      - name: Run HLO tests and collect data
        run: |
          binary_dir="./bazel-out/k8-opt/bin/xla/tools"
          mkdir -p "$OUTPUT_DIR"

          # Runs one HLO module through hlo_runner_main_gpu, appends the output
          # of compute_xspace_stats_main_gpu to the same report, and prints the
          # report to the job log.
          #   $1: path to the .hlo file; its base name (without ".hlo") names
          #       the report and XSpace files under $OUTPUT_DIR.
          run_hlo_benchmark() {
            local hlo_file="$1"
            local base_name="${hlo_file##*/}"
            base_name="${base_name%.hlo}"
            local output_prefix="$OUTPUT_DIR/$base_name"

            echo "Running GPU test: $hlo_file"
            "$binary_dir/multihost_hlo_runner/hlo_runner_main_gpu" \
              --device_type=gpu \
              --num_repeats=5 \
              --use_spmd_partitioning \
              --profile_execution=True \
              --xla_gpu_dump_xspace_to="${output_prefix}_xspace.pb" \
              "$hlo_file" \
              > "${output_prefix}.txt"
            "$binary_dir/compute_xspace_stats_main_gpu" \
              --input="${output_prefix}_xspace.pb" \
              --device_type=GPU \
              >> "${output_prefix}.txt"
            cat "${output_prefix}.txt"
            echo "Output written to: ${output_prefix}.txt"
          }

          run_hlo_benchmark "xla/tools/hlo_opt/tests/gpu_hlo_backend.hlo"
          run_hlo_benchmark "tmp_hlo/gemma2_2b_keras_jax.hlo"
          run_hlo_benchmark "tmp_hlo/gemma3_1b_flax_call.hlo"
          run_hlo_benchmark "tmp_hlo/gemma3_1b_flax_sample_loop.hlo"

      - name: Upload HLO test output to GCS
        run: |
          GCS_BUCKET="gs://openxla-nightly-transient"
          TIMESTAMP=$(date +%Y%m%d_%H%M%S)
          # Take the folder name from TIMESTAMP instead of calling `date` a
          # second time, so the two cannot disagree across midnight.
          DATE_FOLDER="${TIMESTAMP%%_*}"
          COMMIT_SHA="${{ github.sha }}"
          RUN_ID="${{ github.run_id }}"

          # Copies $OUTPUT_DIR/<base_name>.txt to the bucket under a name that
          # encodes the date, timestamp, runner label, run id and commit.
          #   $1: report base name (file name without ".txt")
          upload_to_gcs() {
            local base_name="$1"
            local gcs_file_name="${DATE_FOLDER}/${TIMESTAMP}_${{ matrix.job_info.os }}_run_${RUN_ID}_commit_${COMMIT_SHA}_${base_name}.txt"
            echo "Uploading $OUTPUT_DIR/${base_name}.txt to $GCS_BUCKET/$gcs_file_name"
            gsutil cp "$OUTPUT_DIR/${base_name}.txt" "$GCS_BUCKET/$gcs_file_name"
          }

          # Same order as the benchmark step produces the reports.
          for report in \
            gpu_hlo_backend \
            gemma2_2b_keras_jax \
            gemma3_1b_flax_call \
            gemma3_1b_flax_sample_loop; do
            upload_to_gcs "$report"
          done

      - name: Upload XSpace artifacts
        uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1  # v4.6.1
        with:
          # The runner label keeps artifact names distinct across matrix jobs.
          name: gpu-xla-benchmarks-xspace-${{ matrix.job_info.os }}
          path: ${{ github.workspace }}/output/*_xspace.pb