Introduce hermetic CUDA in Google ML projects.
Instead of relying on pre-installed NVIDIA CUDA and CUDNN libraries and environment variables that point to their installation locations, Bazel should automatically download the CUDA and CUDNN distributions into its cache and use them during the build and test phases.

PiperOrigin-RevId: 616865795
tensorflower-gardener committed Jun 12, 2024
1 parent 7817eb0 commit 9354d5b
Showing 133 changed files with 6,281 additions and 885 deletions.
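With this change, building or testing CUDA targets no longer requires a pre-installed CUDA toolkit or any CUDA_TOOLKIT_PATH/LD_LIBRARY_PATH setup; Bazel fetches the CUDA and CUDNN redistributions itself. Below is a minimal usage sketch based on the flags added in this commit; the GPU test target name is a placeholder, not part of the change.

    # Build with the default hermetic versions pinned in .bazelrc
    # (CUDA 12.3.2, CUDNN 8.9.7.29); no local CUDA installation is needed.
    bazel build --config=cuda //tensorflow/tools/pip_package:wheel

    # Run GPU tests; the test:cuda config sets
    # --@local_config_cuda//cuda:include_hermetic_cuda_libs=true so the
    # hermetic CUDA libraries are available in the test environment.
    bazel test --config=cuda //path/to/some:gpu_test  # placeholder target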
25 changes: 10 additions & 15 deletions .bazelrc
@@ -225,13 +225,17 @@ build:mkl_aarch64_threadpool -c opt
build:cuda --repo_env TF_NEED_CUDA=1
build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
build:cuda --@local_config_cuda//:enable_cuda
# Default CUDA and CUDNN versions.
build:cuda --repo_env=HERMETIC_CUDA_VERSION="12.3.2"
build:cuda --repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29"
# This flag is needed to include hermetic CUDA libraries for bazel tests.
test:cuda --@local_config_cuda//cuda:include_hermetic_cuda_libs=true

# CUDA: This config refers to building CUDA op kernels with clang.
build:cuda_clang --config=cuda
# Enable TensorRT optimizations https://developer.nvidia.com/tensorrt
build:cuda_clang --config=tensorrt
build:cuda_clang --action_env=TF_CUDA_CLANG="1"
build:cuda_clang --@local_config_cuda//:cuda_compiler=clang
build:cuda_clang --copt=-Qunused-arguments
# Select supported compute capabilities (supported graphics cards).
# This is the same as the official TensorFlow builds.
# See https://developer.nvidia.com/cuda-gpus#compute
@@ -240,16 +244,14 @@ build:cuda_clang --@local_config_cuda//:cuda_compiler=clang
# release while SASS is only forward compatible inside the current
# major release. Example: sm_80 kernels can run on sm_89 GPUs but
# not on sm_90 GPUs. compute_80 kernels though can also run on sm_90 GPUs.
build:cuda_clang --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,sm_89,compute_90"
build:cuda_clang --repo_env=HERMETIC_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,sm_89,compute_90"

# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
build:cuda_clang_official --config=cuda_clang
build:cuda_clang_official --action_env=TF_CUDA_VERSION="12"
build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8"
build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3"
build:cuda_clang_official --repo_env=HERMETIC_CUDA_VERSION="12.3.2"
build:cuda_clang_official --repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29"
build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang"
build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
build:cuda_clang_official --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain"

# Build with nvcc for CUDA and clang for host
@@ -546,10 +548,6 @@ build:rbe_linux_cuda --config=cuda_clang_official
build:rbe_linux_cuda --config=rbe_linux_cpu
# For Remote build execution -- GPU configuration
build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1
build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.17-clang_config_cuda"
build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.17-clang_config_tensorrt"
build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.17-clang_config_nccl"
test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"

build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda
build:rbe_linux_cuda_nvcc --config=nvcc_clang
@@ -634,7 +632,6 @@ build:release_cpu_linux_base --repo_env=BAZEL_COMPILER="/usr/lib/llvm-17/bin/cla
# Test-related settings below this point.
test:release_linux_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true
test:release_linux_base --local_test_jobs=HOST_CPUS
test:release_linux_base --test_env=LD_LIBRARY_PATH
# Give only the list of failed tests at the end of the log
test:release_linux_base --test_summary=short

@@ -646,7 +643,6 @@ build:release_gpu_linux --config=release_cpu_linux
# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
# Note that linux cpu and cuda builds share the same toolchain now.
build:release_gpu_linux --config=cuda_clang_official
test:release_gpu_linux --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
# Local test jobs has to be 4 because parallel_gpu_execute is fragile, I think
test:release_gpu_linux --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute

@@ -677,9 +673,8 @@ build:unsupported_gpu_linux --config=unsupported_cpu_linux
build:unsupported_gpu_linux --action_env=TF_CUDA_VERSION="11"
build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8"
build:unsupported_gpu_linux --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80"
build:unsupported_gpu_linux --config=tensorrt
build:unsupported_gpu_linux --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-11.2"
build:unsupported_gpu_linux --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.1/lib64:/usr/local/tensorrt/lib"
build:unsupported_gpu_linux --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.1/lib64"
build:unsupported_gpu_linux --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain

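The HERMETIC_* repo_env flags introduced above can also be passed on the command line to override the .bazelrc defaults per invocation. A sketch, shown with the default version strings; other versions listed in the CUDA/CUDNN redistribution JSON can be substituted, and the compute-capability list here is only an illustration.

    # Override the hermetic CUDA/CUDNN versions and the compute capabilities
    # for a single build; assumes a working Clang CUDA toolchain for
    # --config=cuda_clang.
    bazel build --config=cuda_clang \
      --repo_env=HERMETIC_CUDA_VERSION="12.3.2" \
      --repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29" \
      --repo_env=HERMETIC_CUDA_COMPUTE_CAPABILITIES="sm_80,compute_90" \
      //tensorflow/tools/pip_package:wheel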
33 changes: 33 additions & 0 deletions WORKSPACE
@@ -101,3 +101,36 @@ tf_workspace1()
load("@//tensorflow:workspace0.bzl", "tf_workspace0")

tf_workspace0()

load(
"@local_tsl//third_party/gpus/cuda:hermetic_cuda_json_init_repository.bzl",
"CUDA_REDIST_JSON_DICT",
"CUDNN_REDIST_JSON_DICT",
"hermetic_cuda_json_init_repository",
)

hermetic_cuda_json_init_repository(
cuda_json_dict = CUDA_REDIST_JSON_DICT,
cudnn_json_dict = CUDNN_REDIST_JSON_DICT,
)

load(
"@cuda_redist_json//:distributions.bzl",
"CUDA_DISTRIBUTIONS",
"CUDNN_DISTRIBUTIONS",
)
load(
"@local_tsl//third_party/gpus/cuda:hermetic_cuda_redist_init_repositories.bzl",
"CUDA_DIST_PATH_PREFIX",
"CUDA_NCCL_WHEELS",
"CUDNN_DIST_PATH_PREFIX",
"hermetic_cuda_redist_init_repositories",
)

hermetic_cuda_redist_init_repositories(
cuda_dist_path_prefix = CUDA_DIST_PATH_PREFIX,
cuda_distributions = CUDA_DISTRIBUTIONS,
cuda_nccl_wheels = CUDA_NCCL_WHEELS,
cudnn_dist_path_prefix = CUDNN_DIST_PATH_PREFIX,
cudnn_distributions = CUDNN_DISTRIBUTIONS,
)
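With the repositories above initialized, the cquery used by the CI checks further down doubles as a quick manual check that CUDA dependencies resolve through @local_config_cuda with the hermetic flags set. A sketch extracted from those checks:

    # Is the hermetic CUDA runtime reachable from the pip package target?
    bazel cquery \
      --experimental_cc_shared_library \
      --@local_config_cuda//:enable_cuda \
      --repo_env=HERMETIC_CUDA_VERSION="12.3.2" \
      --repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29" \
      "somepath(//tensorflow/tools/pip_package:wheel, @local_config_cuda//cuda:cudart)"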
@@ -216,6 +216,8 @@ EOF
bazel cquery \
--experimental_cc_shared_library \
--@local_config_cuda//:enable_cuda \
--repo_env=HERMETIC_CUDA_VERSION="12.3.2" \
--repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29" \
"somepath(//tensorflow/tools/pip_package:build_pip_package, " \
"@local_config_cuda//cuda:cudart + "\
"@local_config_cuda//cuda:cudart + "\
@@ -237,6 +239,8 @@ EOF
bazel cquery \
--experimental_cc_shared_library \
--@local_config_cuda//:enable_cuda \
--repo_env=HERMETIC_CUDA_VERSION="12.3.2" \
--repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29" \
--define framework_shared_object=false \
"somepath(//tensorflow/tools/pip_package:build_pip_package, " \
"@local_config_cuda//cuda:cudart + "\
4 changes: 4 additions & 0 deletions ci/official/utilities/code_check_full.bats
@@ -215,6 +215,8 @@ EOF
bazel cquery \
--experimental_cc_shared_library \
--@local_config_cuda//:enable_cuda \
--repo_env=HERMETIC_CUDA_VERSION="12.3.2" \
--repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29" \
"somepath(//tensorflow/tools/pip_package:wheel, " \
"@local_config_cuda//cuda:cudart + "\
"@local_config_cuda//cuda:cudart + "\
@@ -236,6 +238,8 @@ EOF
bazel cquery \
--experimental_cc_shared_library \
--@local_config_cuda//:enable_cuda \
--repo_env=HERMETIC_CUDA_VERSION="12.3.2" \
--repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29" \
--define framework_shared_object=false \
"somepath(//tensorflow/tools/pip_package:wheel, " \
"@local_config_cuda//cuda:cudart + "\
