Introduce hermetic CUDA in Google ML projects.
Instead of relying on pre-installed NVIDIA CUDA and CUDNN libraries and environment variables that point to their installation locations, Bazel should automatically download the CUDA and CUDNN distributions into its cache and use them during the build and test phases.

PiperOrigin-RevId: 616865795
tensorflower-gardener committed Jun 12, 2024
1 parent 7817eb0 commit 9354d5b
Showing 133 changed files with 6,281 additions and 885 deletions.
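With this change, building or testing CUDA targets no longer requires a pre-installed CUDA toolkit or any CUDA_TOOLKIT_PATH/LD_LIBRARY_PATH setup; Bazel fetches the CUDA and CUDNN redistributions itself. Below is a minimal usage sketch based on the flags added in this commit; the GPU test target name is a placeholder, not part of the change.

    # Build with the default hermetic versions pinned in .bazelrc
    # (CUDA 12.3.2, CUDNN 8.9.7.29); no local CUDA installation is needed.
    bazel build --config=cuda //tensorflow/tools/pip_package:wheel

    # Run GPU tests; the test:cuda config sets
    # --@local_config_cuda//cuda:include_hermetic_cuda_libs=true so the
    # hermetic CUDA libraries are available in the test environment.
    bazel test --config=cuda //path/to/some:gpu_test  # placeholder target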
25 changes: 10 additions & 15 deletions .bazelrc
@@ -225,13 +225,17 @@ build:mkl_aarch64_threadpool -c opt
build:cuda --repo_env TF_NEED_CUDA=1
build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
build:cuda --@local_config_cuda//:enable_cuda
# Default CUDA and CUDNN versions.
build:cuda --repo_env=HERMETIC_CUDA_VERSION="12.3.2"
build:cuda --repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29"
# This flag is needed to include hermetic CUDA libraries for bazel tests.
test:cuda --@local_config_cuda//cuda:include_hermetic_cuda_libs=true

# CUDA: This config refers to building CUDA op kernels with clang.
build:cuda_clang --config=cuda
# Enable TensorRT optimizations https://developer.nvidia.com/tensorrt
build:cuda_clang --config=tensorrt
build:cuda_clang --action_env=TF_CUDA_CLANG="1"
build:cuda_clang --@local_config_cuda//:cuda_compiler=clang
build:cuda_clang --copt=-Qunused-arguments
# Select supported compute capabilities (supported graphics cards).
# This is the same as the official TensorFlow builds.
# See https://developer.nvidia.com/cuda-gpus#compute
@@ -240,16 +244,14 @@ build:cuda_clang --@local_config_cuda//:cuda_compiler=clang
# release while SASS is only forward compatible inside the current
# major release. Example: sm_80 kernels can run on sm_89 GPUs but
# not on sm_90 GPUs. compute_80 kernels though can also run on sm_90 GPUs.
build:cuda_clang --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,sm_89,compute_90"
build:cuda_clang --repo_env=HERMETIC_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,sm_89,compute_90"

# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
build:cuda_clang_official --config=cuda_clang
build:cuda_clang_official --action_env=TF_CUDA_VERSION="12"
build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8"
build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3"
build:cuda_clang_official --repo_env=HERMETIC_CUDA_VERSION="12.3.2"
build:cuda_clang_official --repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29"
build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang"
build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
build:cuda_clang_official --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain"

# Build with nvcc for CUDA and clang for host
@@ -546,10 +548,6 @@ build:rbe_linux_cuda --config=cuda_clang_official
build:rbe_linux_cuda --config=rbe_linux_cpu
# For Remote build execution -- GPU configuration
build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1
build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.17-clang_config_cuda"
build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.17-clang_config_tensorrt"
build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.17-clang_config_nccl"
test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"

build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda
build:rbe_linux_cuda_nvcc --config=nvcc_clang
@@ -634,7 +632,6 @@ build:release_cpu_linux_base --repo_env=BAZEL_COMPILER="/usr/lib/llvm-17/bin/cla
# Test-related settings below this point.
test:release_linux_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true
test:release_linux_base --local_test_jobs=HOST_CPUS
test:release_linux_base --test_env=LD_LIBRARY_PATH
# Give only the list of failed tests at the end of the log
test:release_linux_base --test_summary=short

@@ -646,7 +643,6 @@ build:release_gpu_linux --config=release_cpu_linux
# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
# Note that linux cpu and cuda builds share the same toolchain now.
build:release_gpu_linux --config=cuda_clang_official
test:release_gpu_linux --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
# Local test jobs has to be 4 because parallel_gpu_execute is fragile, I think
test:release_gpu_linux --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute

@@ -677,9 +673,8 @@ build:unsupported_gpu_linux --config=unsupported_cpu_linux
build:unsupported_gpu_linux --action_env=TF_CUDA_VERSION="11"
build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8"
build:unsupported_gpu_linux --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80"
build:unsupported_gpu_linux --config=tensorrt
build:unsupported_gpu_linux --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-11.2"
build:unsupported_gpu_linux --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.1/lib64:/usr/local/tensorrt/lib"
build:unsupported_gpu_linux --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.1/lib64"
build:unsupported_gpu_linux --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain

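The HERMETIC_* repo_env flags introduced above can also be passed on the command line to override the .bazelrc defaults per invocation. A sketch, shown with the default version strings; other versions listed in the CUDA/CUDNN redistribution JSON can be substituted, and the compute-capability list here is only an illustration.

    # Override the hermetic CUDA/CUDNN versions and the compute capabilities
    # for a single build; assumes a working Clang CUDA toolchain for
    # --config=cuda_clang.
    bazel build --config=cuda_clang \
      --repo_env=HERMETIC_CUDA_VERSION="12.3.2" \
      --repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29" \
      --repo_env=HERMETIC_CUDA_COMPUTE_CAPABILITIES="sm_80,compute_90" \
      //tensorflow/tools/pip_package:wheel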
33 changes: 33 additions & 0 deletions WORKSPACE
@@ -101,3 +101,36 @@ tf_workspace1()
load("@//tensorflow:workspace0.bzl", "tf_workspace0")

tf_workspace0()

load(
"@local_tsl//third_party/gpus/cuda:hermetic_cuda_json_init_repository.bzl",
"CUDA_REDIST_JSON_DICT",
"CUDNN_REDIST_JSON_DICT",
"hermetic_cuda_json_init_repository",
)

hermetic_cuda_json_init_repository(
cuda_json_dict = CUDA_REDIST_JSON_DICT,
cudnn_json_dict = CUDNN_REDIST_JSON_DICT,
)

load(
"@cuda_redist_json//:distributions.bzl",
"CUDA_DISTRIBUTIONS",
"CUDNN_DISTRIBUTIONS",
)
load(
"@local_tsl//third_party/gpus/cuda:hermetic_cuda_redist_init_repositories.bzl",
"CUDA_DIST_PATH_PREFIX",
"CUDA_NCCL_WHEELS",
"CUDNN_DIST_PATH_PREFIX",
"hermetic_cuda_redist_init_repositories",
)

hermetic_cuda_redist_init_repositories(
cuda_dist_path_prefix = CUDA_DIST_PATH_PREFIX,
cuda_distributions = CUDA_DISTRIBUTIONS,
cuda_nccl_wheels = CUDA_NCCL_WHEELS,
cudnn_dist_path_prefix = CUDNN_DIST_PATH_PREFIX,
cudnn_distributions = CUDNN_DISTRIBUTIONS,
)
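With the repositories above initialized, the cquery used by the CI checks further down doubles as a quick manual check that CUDA dependencies resolve through @local_config_cuda with the hermetic flags set. A sketch extracted from those checks:

    # Is the hermetic CUDA runtime reachable from the pip package target?
    bazel cquery \
      --experimental_cc_shared_library \
      --@local_config_cuda//:enable_cuda \
      --repo_env=HERMETIC_CUDA_VERSION="12.3.2" \
      --repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29" \
      "somepath(//tensorflow/tools/pip_package:wheel, @local_config_cuda//cuda:cudart)"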
@@ -216,6 +216,8 @@ EOF
bazel cquery \
--experimental_cc_shared_library \
--@local_config_cuda//:enable_cuda \
--repo_env=HERMETIC_CUDA_VERSION="12.3.2" \
--repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29" \
"somepath(//tensorflow/tools/pip_package:build_pip_package, " \
"@local_config_cuda//cuda:cudart + "\
"@local_config_cuda//cuda:cudart + "\
@@ -237,6 +239,8 @@ EOF
bazel cquery \
--experimental_cc_shared_library \
--@local_config_cuda//:enable_cuda \
--repo_env=HERMETIC_CUDA_VERSION="12.3.2" \
--repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29" \
--define framework_shared_object=false \
"somepath(//tensorflow/tools/pip_package:build_pip_package, " \
"@local_config_cuda//cuda:cudart + "\
4 changes: 4 additions & 0 deletions ci/official/utilities/code_check_full.bats
@@ -215,6 +215,8 @@ EOF
bazel cquery \
--experimental_cc_shared_library \
--@local_config_cuda//:enable_cuda \
--repo_env=HERMETIC_CUDA_VERSION="12.3.2" \
--repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29" \
"somepath(//tensorflow/tools/pip_package:wheel, " \
"@local_config_cuda//cuda:cudart + "\
"@local_config_cuda//cuda:cudart + "\
@@ -236,6 +238,8 @@ EOF
bazel cquery \
--experimental_cc_shared_library \
--@local_config_cuda//:enable_cuda \
--repo_env=HERMETIC_CUDA_VERSION="12.3.2" \
--repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29" \
--define framework_shared_object=false \
"somepath(//tensorflow/tools/pip_package:wheel, " \
"@local_config_cuda//cuda:cudart + "\
