[go: up, home]

Skip to content

Commit

Permalink
Make the CI at least build unsupported GPU test targets
Browse files Browse the repository at this point in the history
This should allow us to uncover build failures in tests that require
hardware unavailable in presubmit — such as H100 GPUs at the time of
writing.

PiperOrigin-RevId: 632440611
  • Loading branch information
beckerhe authored and tensorflower-gardener committed Jun 26, 2024
1 parent 9be1659 commit 3e30c2e
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 22 deletions.
69 changes: 48 additions & 21 deletions third_party/xla/build_tools/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,22 +159,42 @@ class Build:
docker_image: DockerImage
target_patterns: Tuple[str, ...]
configs: Tuple[str, ...] = ()
tag_filters: Tuple[str, ...] = ()
build_tag_filters: Tuple[str, ...] = ()
test_tag_filters: Tuple[str, ...] = ()
action_env: Dict[str, Any] = dataclasses.field(default_factory=dict)
test_env: Dict[str, Any] = dataclasses.field(default_factory=dict)
options: Dict[str, Any] = dataclasses.field(default_factory=dict)

def bazel_test_command(self) -> List[str]:
def _common_bazel_options(self) -> List[str]:
options = _dict_to_cli_options(self.options)
configs = [f"--config={config}" for config in self.configs]
build_tag_filters = f"--build_tag_filters={','.join(self.tag_filters)}"
test_tag_filters = f"--test_tag_filters={','.join(self.tag_filters)}"
build_tag_filters = (
f"--build_tag_filters={','.join(self.build_tag_filters)}"
)
test_tag_filters = f"--test_tag_filters={','.join(self.test_tag_filters)}"
action_env = [f"--action_env={k}={v}" for k, v in self.action_env.items()]
test_env = [f"--test_env={k}={v}" for k, v in self.test_env.items()]

tag_filters = [build_tag_filters, test_tag_filters]
all_options = tag_filters + configs + action_env + test_env + options
return ["bazel", "test", *all_options, "--", *self.target_patterns]
return tag_filters + configs + action_env + test_env + options

def bazel_test_command(self) -> List[str]:
  """Returns the full `bazel test` invocation for this build.

  The command is `bazel test <common options> -- <target patterns>`;
  the `--` separates options from target patterns.
  """
  command = ["bazel", "test"]
  command.extend(self._common_bazel_options())
  command.append("--")
  command.extend(self.target_patterns)
  return command

def bazel_build_command(self) -> List[str]:
  """Returns the full `bazel build` invocation for this build.

  Mirrors `bazel_test_command` but uses the `build` subcommand, so that
  test targets requiring unavailable hardware can at least be compiled.
  """
  command = ["bazel", "build"]
  command.extend(self._common_bazel_options())
  command.append("--")
  command.extend(self.target_patterns)
  return command


def _tag_filters_for_compute_capability(
Expand Down Expand Up @@ -214,7 +234,8 @@ def nvidia_gpu_build_with_compute_capability(
docker_image=_CUDNN_9_IMAGE,
target_patterns=_XLA_DEFAULT_TARGET_PATTERNS,
configs=configs,
tag_filters=("-no_oss", "requires-gpu-nvidia") + extra_gpu_tags,
test_tag_filters=("-no_oss", "requires-gpu-nvidia") + extra_gpu_tags,
build_tag_filters=("-no_oss", "requires-gpu-nvidia"),
options=dict(
run_under="//tools/ci_build/gpu_build:parallel_gpu_execute",
repo_env=f"TF_CUDA_COMPUTE_CAPABILITIES={compute_capability/10}",
Expand All @@ -223,34 +244,39 @@ def nvidia_gpu_build_with_compute_capability(
)


# Tags shared by the build and test phases of the x86 CPU build: exclude
# non-OSS targets and anything that requires GPU hardware.
cpu_x86_tags = (
    "-no_oss",
    "-gpu",
    "-requires-gpu-nvidia",
    "-requires-gpu-amd",
)
_CPU_X86_BUILD = Build(
    type_=BuildType.CPU_X86,
    repo="openxla/xla",
    docker_image=_DEFAULT_IMAGE,
    configs=("warnings", "nonccl", "rbe_linux_cpu"),
    target_patterns=_XLA_DEFAULT_TARGET_PATTERNS + ("-//xla/service/gpu/...",),
    # Same tags for build and test: on a CPU-only runner there is nothing
    # GPU-only worth even building.
    build_tag_filters=cpu_x86_tags,
    test_tag_filters=cpu_x86_tags,
    options=_DEFAULT_BAZEL_OPTIONS,
)

# Tags shared by the build and test phases of the ARM64 CPU build: exclude
# non-OSS targets, GPU-requiring targets, and targets opted out of ARM runs.
cpu_arm_tags = (
    "-no_oss",
    "-gpu",
    "-requires-gpu-nvidia",
    "-requires-gpu-amd",
    "-not_run:arm",
)
_CPU_ARM64_BUILD = Build(
    type_=BuildType.CPU_ARM64,
    repo="openxla/xla",
    docker_image=_ARM64_JAX_MULTI_PYTHON_IMAGE,
    configs=("warnings", "rbe_cross_compile_linux_arm64_xla", "nonccl"),
    target_patterns=_XLA_DEFAULT_TARGET_PATTERNS + ("-//xla/service/gpu/...",),
    options={**_DEFAULT_BAZEL_OPTIONS, "build_tests_only": True},
    build_tag_filters=cpu_arm_tags,
    test_tag_filters=cpu_arm_tags,
)
# TODO(ddunleavy): Setup additional build for a100 tests once L4 RBE is ready.
_GPU_BUILD = nvidia_gpu_build_with_compute_capability(
Expand Down Expand Up @@ -290,7 +316,8 @@ def nvidia_gpu_build_with_compute_capability(
"tensorflow_testing_rbe_linux",
),
target_patterns=("//tests:gpu_tests", "//tests:backend_independent_tests"),
tag_filters=("-multiaccelerator",),
build_tag_filters=("-multiaccelerator",),
test_tag_filters=("-multiaccelerator",),
test_env=dict(
JAX_SKIP_SLOW_TESTS=1,
TF_CPP_MIN_LOG_LEVEL=0,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ class MultiHeadedAttentionTest : public GpuCodegenTest {
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> test_module,
ParseAndReturnVerifiedModule(hlo_string));
TF_ASSERT_OK_AND_ASSIGN(num_fmha_calls,
CountFMHACalls(test_module->Clone()));
CountFMHACalls(std::move(test_module->Clone())));
EXPECT_EQ(num_fmha_calls, expected_num_fmha_calls);
const Literal actual_result =
ExecuteAndTransfer(std::move(test_module), literals);
Expand Down

0 comments on commit 3e30c2e

Please sign in to comment.