[go: nahoru, domu]

Skip to content

Commit

Permalink
Implement hermetic cuda usage across TF projects.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 616865795
  • Loading branch information
tensorflower-gardener committed Apr 1, 2024
1 parent e7efc3a commit 519f7bd
Show file tree
Hide file tree
Showing 75 changed files with 4,322 additions and 488 deletions.
12 changes: 1 addition & 11 deletions .bazelrc
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,6 @@ build:cuda --@local_config_cuda//:enable_cuda
# CUDA: This config refers to building CUDA op kernels with clang.
build:cuda_clang --config=cuda
# Enable TensorRT optimizations https://developer.nvidia.com/tensorrt
build:cuda_clang --config=tensorrt
build:cuda_clang --action_env=TF_CUDA_CLANG="1"
build:cuda_clang --@local_config_cuda//:cuda_compiler=clang
# Select supported compute capabilities (supported graphics cards).
Expand All @@ -247,12 +246,10 @@ build:cuda_clang --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,sm_8

# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
build:cuda_clang_official --config=cuda_clang
build:cuda_clang_official --action_env=TF_CUDA_VERSION="12"
build:cuda_clang_official --action_env=TF_CUDA_VERSION="12.3"
build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8"
build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3"
build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang"
build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
build:cuda_clang_official --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain"

# Build with nvcc for CUDA and clang for host
Expand Down Expand Up @@ -533,9 +530,7 @@ build:rbe_linux_cuda --config=rbe_linux_cpu
# For Remote build execution -- GPU configuration
build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1
build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.16-clang_config_cuda"
build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.16-clang_config_tensorrt"
build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.16-clang_config_nccl"
test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"

build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda
build:rbe_linux_cuda_nvcc --config=nvcc_clang
Expand Down Expand Up @@ -633,7 +628,6 @@ build:release_cpu_linux_base --repo_env=BAZEL_COMPILER="/usr/lib/llvm-17/bin/cla
# Test-related settings below this point.
test:release_linux_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true
test:release_linux_base --local_test_jobs=HOST_CPUS
test:release_linux_base --test_env=LD_LIBRARY_PATH
# Give only the list of failed tests at the end of the log
test:release_linux_base --test_summary=short

Expand All @@ -645,7 +639,6 @@ build:release_gpu_linux --config=release_cpu_linux
# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
# Note that linux cpu and cuda builds share the same toolchain now.
build:release_gpu_linux --config=cuda_clang_official
test:release_gpu_linux --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
# The number of local test jobs has to be 4 because parallel_gpu_execute is fragile, I think
test:release_gpu_linux --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute

Expand Down Expand Up @@ -676,9 +669,6 @@ build:unsupported_gpu_linux --config=unsupported_cpu_linux
build:unsupported_gpu_linux --action_env=TF_CUDA_VERSION="11"
build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8"
build:unsupported_gpu_linux --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80"
build:unsupported_gpu_linux --config=tensorrt
build:unsupported_gpu_linux --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-11.2"
build:unsupported_gpu_linux --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.1/lib64:/usr/local/tensorrt/lib"
build:unsupported_gpu_linux --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain

Expand Down
47 changes: 0 additions & 47 deletions ci/official/utilities/code_check_full.bats
Original file line number Diff line number Diff line change
Expand Up @@ -210,53 +210,6 @@ EOF
fi
}

# The Python package is not allowed to depend on any CUDA packages.
@test "Pip package doesn't depend on CUDA" {
  # Query, with CUDA enabled, for a dependency path from the pip wheel target
  # to any of the banned CUDA/TensorRT targets. The query expression is passed
  # as several quoted words; each banned target is listed exactly once.
  bazel cquery \
    --experimental_cc_shared_library \
    --@local_config_cuda//:enable_cuda \
    "somepath(//tensorflow/tools/pip_package:wheel, " \
    "@local_config_cuda//cuda:cudart + "\
    "@local_config_cuda//cuda:cuda_driver + "\
    "@local_config_cuda//cuda:cudnn + "\
    "@local_config_cuda//cuda:curand + "\
    "@local_config_cuda//cuda:cusolver + "\
    "@local_config_tensorrt//:tensorrt)" --keep_going > "$BATS_TEST_TMPDIR/out"

  # Explanation followed by the raw query output, for whoever reads the log.
  cat <<EOF
There was a path found connecting //tensorflow/tools/pip_package:wheel
to a banned CUDA dependency. Here's the output from bazel query:
EOF
  cat "$BATS_TEST_TMPDIR/out"
  # The test passes only when the query wrote nothing (output file is empty).
  [[ ! -s "$BATS_TEST_TMPDIR/out" ]]
}

@test "Pip package doesn't depend on CUDA for static builds (i.e. Windows)" {
  # Same query as the shared-object variant, but with
  # '--define framework_shared_object=false' so that statically-built
  # configurations are checked too. Each banned target is listed exactly once.
  bazel cquery \
    --experimental_cc_shared_library \
    --@local_config_cuda//:enable_cuda \
    --define framework_shared_object=false \
    "somepath(//tensorflow/tools/pip_package:wheel, " \
    "@local_config_cuda//cuda:cudart + "\
    "@local_config_cuda//cuda:cuda_driver + "\
    "@local_config_cuda//cuda:cudnn + "\
    "@local_config_cuda//cuda:curand + "\
    "@local_config_cuda//cuda:cusolver + "\
    "@local_config_tensorrt//:tensorrt)" --keep_going > "$BATS_TEST_TMPDIR/out"

  # Explanation followed by the raw query output, for whoever reads the log.
  cat <<EOF
There was a path found connecting //tensorflow/tools/pip_package:wheel
to a banned CUDA dependency when '--define framework_shared_object=false' is set.
This means that a CUDA target was probably included via an is_static condition,
used when targeting platforms like Windows where we build statically instead
of dynamically. Here's the output from bazel query:
EOF
  cat "$BATS_TEST_TMPDIR/out"
  # The test passes only when the query wrote nothing (output file is empty).
  [[ ! -s "$BATS_TEST_TMPDIR/out" ]]
}

@test "All tensorflow.org/code links point to real files" {
for i in $(grep -onI 'https://www.tensorflow.org/code/[a-zA-Z0-9/._-]\+' -r tensorflow); do
target=$(echo $i | sed 's!.*https://www.tensorflow.org/code/!!g')
Expand Down
4 changes: 3 additions & 1 deletion tensorflow/compiler/tests/build_defs.bzl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Build rules for Tensorflow/XLA testing."""

load("//tensorflow:strict.default.bzl", "py_strict_test")
load("//tensorflow:tensorflow.bzl", "py_test")
load("//tensorflow:tensorflow.bzl", "if_cuda_configured", "if_oss", "py_test")
load("//tensorflow/compiler/tests:plugin.bzl", "plugins")
load(
"//tensorflow/core/platform:build_config_root.bzl",
Expand Down Expand Up @@ -90,6 +90,8 @@ def tf_xla_py_test(
"--types=DT_HALF,DT_FLOAT,DT_DOUBLE,DT_UINT8,DT_QUINT8,DT_INT8,DT_QINT8,DT_INT32,DT_QINT32,DT_INT64,DT_BOOL,DT_COMPLEX64,DT_COMPLEX128,DT_BFLOAT16",
]
backend_tags += tf_cuda_tests_tags()
backend_data += if_oss(if_cuda_configured(["@cuda_nvcc//:ptxas", "@cuda_nvcc//:nvvm"]))

elif backend in plugins:
backend_args += [
"--test_device=" + plugins[backend]["device"],
Expand Down
4 changes: 4 additions & 0 deletions tensorflow/dtensor/build_defs.bzl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Helpers for defining multi-platform DTensor test targets."""

load("//tensorflow:strict.default.bzl", "py_strict_test")
load("//tensorflow:tensorflow.bzl", "if_cuda_configured", "if_oss")

# LINT.IfChange
ALL_BACKENDS = [
Expand Down Expand Up @@ -160,6 +161,9 @@ def dtensor_test(

all_tests = []
for config in configurations:
if config["suffix"] == "gpu":
data = data + if_oss(if_cuda_configured(["@cuda_nvcc//:nvvm"]))

config_name = name + "_" + config["suffix"]

all_tests.append(config_name)
Expand Down
19 changes: 19 additions & 0 deletions tensorflow/opensource_only.files
Original file line number Diff line number Diff line change
Expand Up @@ -232,22 +232,40 @@ tf_staging/third_party/googleapis/build_rules.bzl:
tf_staging/third_party/googleapis/googleapis.BUILD:
tf_staging/third_party/googleapis/repository_rules.bzl:
tf_staging/third_party/gpus/BUILD:
tf_staging/third_party/gpus/compiler_common_tools.bzl:
tf_staging/third_party/gpus/crosstool/BUILD.rocm.tpl:
tf_staging/third_party/gpus/crosstool/BUILD.tpl:
tf_staging/third_party/gpus/crosstool/BUILD:
tf_staging/third_party/gpus/crosstool/LICENSE:
tf_staging/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl:
tf_staging/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl:
tf_staging/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl:
tf_staging/third_party/gpus/cuda/BUILD.hermetic.tpl:
tf_staging/third_party/gpus/cuda/BUILD.tpl:
tf_staging/third_party/gpus/cuda/BUILD.windows.tpl:
tf_staging/third_party/gpus/cuda/BUILD:
tf_staging/third_party/gpus/cuda/LICENSE:
tf_staging/third_party/gpus/cuda/build_defs.bzl.tpl:
tf_staging/third_party/gpus/cuda/cuda_cccl.BUILD:
tf_staging/third_party/gpus/cuda/cuda_config.h.tpl:
tf_staging/third_party/gpus/cuda/cuda_config.py.tpl:
tf_staging/third_party/gpus/cuda/cuda_cublas.BUILD.tpl:
tf_staging/third_party/gpus/cuda/cuda_cudart.BUILD.tpl:
tf_staging/third_party/gpus/cuda/cuda_cudnn.BUILD.tpl:
tf_staging/third_party/gpus/cuda/cuda_cufft.BUILD.tpl:
tf_staging/third_party/gpus/cuda/cuda_cupti.BUILD.tpl:
tf_staging/third_party/gpus/cuda/cuda_curand.BUILD.tpl:
tf_staging/third_party/gpus/cuda/cuda_cusolver.BUILD.tpl:
tf_staging/third_party/gpus/cuda/cuda_cusparse.BUILD.tpl:
tf_staging/third_party/gpus/cuda/cuda_nccl.BUILD:
tf_staging/third_party/gpus/cuda/cuda_nvcc.BUILD:
tf_staging/third_party/gpus/cuda/cuda_nvjitlink.BUILD.tpl:
tf_staging/third_party/gpus/cuda/cuda_nvml.BUILD:
tf_staging/third_party/gpus/cuda/cuda_nvprune.BUILD:
tf_staging/third_party/gpus/cuda/cuda_nvtx.BUILD:
tf_staging/third_party/gpus/cuda_configure.bzl:
tf_staging/third_party/gpus/find_cuda_config:.py
tf_staging/third_party/gpus/hermetic_cuda_configure.bzl:
tf_staging/third_party/gpus/rocm/BUILD.tpl:
tf_staging/third_party/gpus/rocm/BUILD:
tf_staging/third_party/gpus/rocm/build_defs.bzl.tpl:
Expand Down Expand Up @@ -275,6 +293,7 @@ tf_staging/third_party/nccl/archive.BUILD:
tf_staging/third_party/nccl/archive.patch:
tf_staging/third_party/nccl/build_defs.bzl.tpl:
tf_staging/third_party/nccl/generated_names.bzl.tpl:
tf_staging/third_party/nccl/hermetic_nccl_configure.bzl:
tf_staging/third_party/nccl/nccl_configure.bzl:
tf_staging/third_party/nccl/system.BUILD.tpl:
tf_staging/third_party/nlohmann_json.BUILD:
Expand Down
5 changes: 5 additions & 0 deletions tensorflow/python/ops/numpy_ops/tests/BUILD
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test")
load("//tensorflow:tensorflow.bzl", "if_cuda_configured", "if_oss")

# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])

Expand Down Expand Up @@ -199,6 +200,10 @@ py_strict_test(
"--num_generated_cases=90",
"--enable_x64", # Needed to enable dtype check
],
data = if_oss(if_cuda_configured([
"@cuda_nvcc//:ptxas",
"@cuda_nvcc//:nvvm",
])),
python_version = "PY3",
shard_count = 20,
srcs_version = "PY2AND3",
Expand Down
6 changes: 5 additions & 1 deletion tensorflow/tensorflow.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ def if_google(google_value, oss_value = []):
"""
return oss_value # copybara:comment_replace return google_value

def if_cuda_configured(cuda_value, default_value = []):
    """Forwards to `if_cuda` with the same arguments.

    Args:
      cuda_value: passed to `if_cuda` as its first argument.
      default_value: passed to `if_cuda` as its second argument; defaults to
        an empty list.

    Returns:
      Whatever `if_cuda(cuda_value, default_value)` returns.
    """
    return if_cuda(cuda_value, default_value)

def if_v2(a):
return select({
clean_dep("//tensorflow:api_version_2"): a,
Expand Down Expand Up @@ -802,7 +805,7 @@ def tf_cc_shared_object(
testonly = kwargs.pop("testonly", False)

for name_os, name_os_major, name_os_full in names:
# Windows DLLs cant be versioned
# Windows DLLs can't be versioned
if name_os.endswith(".dll"):
name_os_major = name_os
name_os_full = name_os
Expand Down Expand Up @@ -2744,6 +2747,7 @@ def gpu_py_test(
test_tags = tags
if config == "gpu":
test_tags = test_tags + tf_gpu_tests_tags()
data = data + if_oss(if_cuda(["@cuda_nvcc//:ptxas", "@cuda_nvcc//:nvvm"]))
if config == "2gpu":
test_tags = test_tags + two_gpu_tags
if "requires-gpu-nvidia" in test_tags:
Expand Down
26 changes: 26 additions & 0 deletions tensorflow/tools/pip_package/build_pip_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,32 @@ def prepare_headers(headers: list[str], srcs_dir: str) -> None:
srcs_dir: target directory where headers are copied to.
"""
path_to_exclude = [
"cuda_cccl/_virtual_includes",
"cuda_cublas/_virtual_includes",
"cuda_cudart/_virtual_includes",
"cuda_cudnn/_virtual_includes",
"cuda_cufft/_virtual_includes",
"cuda_cupti/_virtual_includes",
"cuda_curand/_virtual_includes",
"cuda_cusolver/_virtual_includes",
"cuda_cusparse/_virtual_includes",
"cuda_nvcc/_virtual_includes",
"cuda_nvjitlink/_virtual_includes",
"cuda_nvml/_virtual_includes",
"cuda_nvtx/_virtual_includes",
"external/cuda_cccl",
"external/cuda_cublas",
"external/cuda_cudart",
"external/cuda_cudnn",
"external/cuda_cufft",
"external/cuda_cupti",
"external/cuda_curand",
"external/cuda_cusolver",
"external/cuda_cusparse",
"external/cuda_nvcc",
"external/cuda_nvjitlink",
"external/cuda_nvml",
"external/cuda_nvtx",
"external/pypi",
"external/jsoncpp_git/src",
"local_config_cuda/cuda/_virtual_includes",
Expand Down
8 changes: 2 additions & 6 deletions tensorflow/tools/toolchains/remote_config/configs.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -674,16 +674,14 @@ def initialize_rbe_configs():
"HOST_CXX_COMPILER": "/dt9/usr/bin/gcc",
"HOST_C_COMPILER": "/dt9/usr/bin/gcc",
"PYTHON_BIN_PATH": "/usr/bin/python3",
"TENSORRT_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
"TF_CUDA_CLANG": "0",
"TF_CUDA_COMPUTE_CAPABILITIES": "3.5,6.0",
"TF_CUDA_VERSION": "12.3",
"TF_CUDNN_VERSION": "8.9",
"TF_ENABLE_XLA": "1",
"TF_NEED_CUDA": "1",
"TF_NEED_TENSORRT": "1",
"TF_NEED_TENSORRT": "0",
"TF_SYSROOT": "/dt9",
"TF_TENSORRT_VERSION": "8.6",
},
)

Expand Down Expand Up @@ -713,15 +711,13 @@ def initialize_rbe_configs():
"HOST_CXX_COMPILER": "/usr/lib/llvm-17/bin/clang",
"HOST_C_COMPILER": "/usr/lib/llvm-17/bin/clang",
"PYTHON_BIN_PATH": "/usr/bin/python3",
"TENSORRT_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
"TF_CUDA_CLANG": "1",
"TF_CUDA_COMPUTE_CAPABILITIES": "3.5,6.0",
"TF_CUDA_VERSION": "12.3",
"TF_CUDNN_VERSION": "8.9",
"TF_ENABLE_XLA": "1",
"TF_NEED_CUDA": "1",
"TF_NEED_TENSORRT": "1",
"TF_NEED_TENSORRT": "0",
"TF_SYSROOT": "/dt9",
"TF_TENSORRT_VERSION": "8.6",
},
)
15 changes: 7 additions & 8 deletions tensorflow/tools/toolchains/remote_config/rbe_config.bzl
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
"""Macro that creates external repositories for remote config."""

load("//tensorflow/tools/toolchains/remote_config:containers.bzl", "containers")
load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure")
load("//third_party/gpus:hermetic_cuda_configure.bzl", "hermetic_cuda_configure")
load("//third_party/gpus:rocm_configure.bzl", "remote_rocm_configure")
load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure")
load("//third_party/nccl:hermetic_nccl_configure.bzl", "hermetic_nccl_configure")
load("//third_party/py:python_configure.bzl", "local_python_configure", "remote_python_configure")
load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure")
load("//third_party/tensorrt:tensorrt_configure.bzl", "remote_tensorrt_configure")
Expand Down Expand Up @@ -41,8 +41,7 @@ def _tensorflow_rbe_config(name, compiler, python_versions, os, rocm_version = N
"TF_ENABLE_XLA": "1",
"TF_CUDNN_VERSION": cudnn_version,
"TF_CUDA_VERSION": cuda_version,
"CUDNN_INSTALL_PATH": cudnn_install_path if cudnn_install_path != None else "/usr/lib/x86_64-linux-gnu",
"TF_NEED_TENSORRT": "1",
"TF_NEED_TENSORRT": "0",
"TF_TENSORRT_VERSION": tensorrt_version if tensorrt_version != None else "",
"TENSORRT_INSTALL_PATH": tensorrt_install_path if tensorrt_install_path != None else "/usr/lib/x86_64-linux-gnu",
"GCC_HOST_COMPILER_PATH": compiler if not compiler.endswith("clang") else "",
Expand All @@ -58,13 +57,13 @@ def _tensorflow_rbe_config(name, compiler, python_versions, os, rocm_version = N
"Pool": "default",
}

remote_cuda_configure(
hermetic_cuda_configure(
name = "%s_config_cuda" % name,
environ = env,
exec_properties = exec_properties,
)

remote_nccl_configure(
hermetic_nccl_configure(
name = "%s_config_nccl" % name,
environ = env,
exec_properties = exec_properties,
Expand Down Expand Up @@ -175,13 +174,13 @@ def sigbuild_tf_configs(name_container_map, env):
"Pool": "default",
}

remote_cuda_configure(
hermetic_cuda_configure(
name = "%s_config_cuda" % name,
environ = env,
exec_properties = exec_properties,
)

remote_nccl_configure(
hermetic_nccl_configure(
name = "%s_config_nccl" % name,
environ = env,
exec_properties = exec_properties,
Expand Down
Loading

0 comments on commit 519f7bd

Please sign in to comment.