Implement hermetic CUDA usage across TF projects.
PiperOrigin-RevId: 616865795
tensorflower-gardener committed Apr 10, 2024
1 parent 3e92b6f commit 242d136
Showing 141 changed files with 5,126 additions and 952 deletions.
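The thrust of the change: Bazel now provisions the CUDA toolkit itself ("hermetic" CUDA) instead of discovering a locally installed one through CUDA_TOOLKIT_PATH, LD_LIBRARY_PATH, and interactive configure prompts. As a rough sketch of the workflow this enables (the config name and wheel target appear in the diff below; everything else is illustrative, not taken from the commit):

  # No local CUDA install, CUDA_TOOLKIT_PATH, or LD_LIBRARY_PATH required;
  # the toolchain is pinned by the repository rules behind --config=cuda.
  bazel build --config=cuda //tensorflow/tools/pip_package:wheel
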
14 changes: 3 additions & 11 deletions .bazelrc
@@ -228,13 +228,14 @@ build:mkl_aarch64_threadpool -c opt
build:cuda --repo_env TF_NEED_CUDA=1
build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
build:cuda --@local_config_cuda//:enable_cuda
build:cuda --@local_config_cuda//:enable_hermetic_cuda

# CUDA: This config refers to building CUDA op kernels with clang.
build:cuda_clang --config=cuda
# Enable TensorRT optimizations https://developer.nvidia.com/tensorrt
build:cuda_clang --config=tensorrt
build:cuda_clang --action_env=TF_CUDA_CLANG="1"
build:cuda_clang --@local_config_cuda//:cuda_compiler=clang
build:cuda_clang --copt=-Qunused-arguments
# Select supported compute capabilities (supported graphics cards).
# This is the same as the official TensorFlow builds.
# See https://developer.nvidia.com/cuda-gpus#compute
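
The one-line addition above wires a new boolean build setting into every --config=cuda build. A hedged usage sketch (the flag label comes from the diff; the explicit =false override is an assumption that it behaves like an ordinary bool_flag and can be toggled per invocation):

  # Illustrative only: opt a single build out of the hermetic toolchain.
  bazel build --config=cuda \
    --@local_config_cuda//:enable_hermetic_cuda=false //tensorflow/...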
@@ -247,12 +248,10 @@ build:cuda_clang --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,sm_80,sm_8

# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
build:cuda_clang_official --config=cuda_clang
build:cuda_clang_official --action_env=TF_CUDA_VERSION="12"
build:cuda_clang_official --action_env=TF_CUDA_VERSION="12.3"
build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8"
build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3"
build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang"
build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
build:cuda_clang_official --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain"

# Build with nvcc for CUDA and clang for host
@@ -533,9 +532,7 @@ build:rbe_linux_cuda --config=rbe_linux_cpu
# For Remote build execution -- GPU configuration
build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1
build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.17-clang_config_cuda"
build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.17-clang_config_tensorrt"
build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.17-clang_config_nccl"
test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"

build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda
build:rbe_linux_cuda_nvcc --config=nvcc_clang
@@ -633,7 +630,6 @@ build:release_cpu_linux_base --repo_env=BAZEL_COMPILER="/usr/lib/llvm-17/bin/cla
# Test-related settings below this point.
test:release_linux_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true
test:release_linux_base --local_test_jobs=HOST_CPUS
test:release_linux_base --test_env=LD_LIBRARY_PATH
# Give only the list of failed tests at the end of the log
test:release_linux_base --test_summary=short

@@ -645,7 +641,6 @@ build:release_gpu_linux --config=release_cpu_linux
# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
# Note that linux cpu and cuda builds share the same toolchain now.
build:release_gpu_linux --config=cuda_clang_official
test:release_gpu_linux --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
# Local test jobs has to be 4 because parallel_gpu_execute is fragile, I think
test:release_gpu_linux --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute

@@ -676,9 +671,6 @@ build:unsupported_gpu_linux --config=unsupported_cpu_linux
build:unsupported_gpu_linux --action_env=TF_CUDA_VERSION="11"
build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8"
build:unsupported_gpu_linux --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80"
build:unsupported_gpu_linux --config=tensorrt
build:unsupported_gpu_linux --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-11.2"
build:unsupported_gpu_linux --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.1/lib64:/usr/local/tensorrt/lib"
build:unsupported_gpu_linux --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain

47 changes: 0 additions & 47 deletions ci/official/utilities/code_check_full.bats
@@ -210,53 +210,6 @@ EOF
fi
}

# The Python package is not allowed to depend on any CUDA packages.
@test "Pip package doesn't depend on CUDA" {
bazel cquery \
--experimental_cc_shared_library \
--@local_config_cuda//:enable_cuda \
"somepath(//tensorflow/tools/pip_package:wheel, " \
"@local_config_cuda//cuda:cudart + "\
"@local_config_cuda//cuda:cudart + "\
"@local_config_cuda//cuda:cuda_driver + "\
"@local_config_cuda//cuda:cudnn + "\
"@local_config_cuda//cuda:curand + "\
"@local_config_cuda//cuda:cusolver + "\
"@local_config_tensorrt//:tensorrt)" --keep_going > $BATS_TEST_TMPDIR/out

cat <<EOF
There was a path found connecting //tensorflow/tools/pip_package:wheel
to a banned CUDA dependency. Here's the output from bazel query:
EOF
cat $BATS_TEST_TMPDIR/out
[[ ! -s $BATS_TEST_TMPDIR/out ]]
}

@test "Pip package doesn't depend on CUDA for static builds (i.e. Windows)" {
bazel cquery \
--experimental_cc_shared_library \
--@local_config_cuda//:enable_cuda \
--define framework_shared_object=false \
"somepath(//tensorflow/tools/pip_package:wheel, " \
"@local_config_cuda//cuda:cudart + "\
"@local_config_cuda//cuda:cudart + "\
"@local_config_cuda//cuda:cuda_driver + "\
"@local_config_cuda//cuda:cudnn + "\
"@local_config_cuda//cuda:curand + "\
"@local_config_cuda//cuda:cusolver + "\
"@local_config_tensorrt//:tensorrt)" --keep_going > $BATS_TEST_TMPDIR/out

cat <<EOF
There was a path found connecting //tensorflow/tools/pip_package:wheel
to a banned CUDA dependency when '--define framework_shared_object=false' is set.
This means that a CUDA target was probably included via an is_static condition,
used when targeting platforms like Windows where we build statically instead
of dynamically. Here's the output from bazel query:
EOF
cat $BATS_TEST_TMPDIR/out
[[ ! -s $BATS_TEST_TMPDIR/out ]]
}

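Both wheel-dependency guards above are deleted outright rather than adapted to hermetic targets. The query shape they used still works for spot checks; a minimal sketch derived from the deleted test (the target names are the ones listed above and may not all resolve after this change):

  # Empty output means no path from the wheel to the banned CUDA dependency.
  bazel cquery --experimental_cc_shared_library --@local_config_cuda//:enable_cuda \
    "somepath(//tensorflow/tools/pip_package:wheel, @local_config_cuda//cuda:cudart)" \
    --keep_going
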
@test "All tensorflow.org/code links point to real files" {
for i in $(grep -onI 'https://www.tensorflow.org/code/[a-zA-Z0-9/._-]\+' -r tensorflow); do
target=$(echo $i | sed 's!.*https://www.tensorflow.org/code/!!g')
215 changes: 2 additions & 213 deletions configure.py
@@ -16,7 +16,6 @@

import argparse
import errno
import glob
import json
import os
import platform
@@ -239,7 +238,7 @@ def setup_python(environ_cp):
write_to_bazelrc('build --python_path=\"{}"'.format(python_bin_path))
environ_cp['PYTHON_BIN_PATH'] = python_bin_path

# If choosen python_lib_path is from a path specified in the PYTHONPATH
# If chosen python_lib_path is from a path specified in the PYTHONPATH
# variable, need to tell bazel to include PYTHONPATH
if environ_cp.get('PYTHONPATH'):
python_paths = environ_cp.get('PYTHONPATH').split(':')
@@ -775,11 +774,6 @@ def get_ndk_api_level(environ_cp, android_ndk_home_path):
def set_gcc_host_compiler_path(environ_cp):
"""Set GCC_HOST_COMPILER_PATH."""
default_gcc_host_compiler_path = which('gcc') or ''
cuda_bin_symlink = '%s/bin/gcc' % environ_cp.get('CUDA_TOOLKIT_PATH')

if os.path.islink(cuda_bin_symlink):
# os.readlink is only available in linux
default_gcc_host_compiler_path = os.path.realpath(cuda_bin_symlink)

gcc_host_compiler_path = prompt_loop_or_load_from_env(
environ_cp,
@@ -937,17 +931,6 @@ def disable_clang_offsetof_extension(clang_version):
write_to_bazelrc('build --copt=-Wno-gnu-offsetof-extensions')


def set_tf_cuda_paths(environ_cp):
"""Set TF_CUDA_PATHS."""
ask_cuda_paths = (
'Please specify the comma-separated list of base paths to look for CUDA '
'libraries and headers. [Leave empty to use the default]: ')
tf_cuda_paths = get_from_env_or_user_or_default(environ_cp, 'TF_CUDA_PATHS',
ask_cuda_paths, '')
if tf_cuda_paths:
environ_cp['TF_CUDA_PATHS'] = tf_cuda_paths


def set_tf_cuda_version(environ_cp):
"""Set TF_CUDA_VERSION."""
ask_cuda_version = (
@@ -960,85 +943,10 @@ def set_tf_cuda_version(environ_cp):
environ_cp['TF_CUDA_VERSION'] = tf_cuda_version


def set_tf_cudnn_version(environ_cp):
"""Set TF_CUDNN_VERSION."""
ask_cudnn_version = (
'Please specify the cuDNN version you want to use. '
'[Leave empty to default to cuDNN %s]: ') % _DEFAULT_CUDNN_VERSION
tf_cudnn_version = get_from_env_or_user_or_default(environ_cp,
'TF_CUDNN_VERSION',
ask_cudnn_version,
_DEFAULT_CUDNN_VERSION)
environ_cp['TF_CUDNN_VERSION'] = tf_cudnn_version


def set_tf_tensorrt_version(environ_cp):
"""Set TF_TENSORRT_VERSION."""
if not (is_linux() or is_windows()):
raise ValueError('Currently TensorRT is only supported on Linux platform.')

if not int(environ_cp.get('TF_NEED_TENSORRT', False)):
return

ask_tensorrt_version = (
'Please specify the TensorRT version you want to use. '
'[Leave empty to default to TensorRT %s]: ') % _DEFAULT_TENSORRT_VERSION
tf_tensorrt_version = get_from_env_or_user_or_default(
environ_cp, 'TF_TENSORRT_VERSION', ask_tensorrt_version,
_DEFAULT_TENSORRT_VERSION)
environ_cp['TF_TENSORRT_VERSION'] = tf_tensorrt_version


def set_tf_nccl_version(environ_cp):
"""Set TF_NCCL_VERSION."""
if not is_linux():
raise ValueError('Currently NCCL is only supported on Linux platform.')

if 'TF_NCCL_VERSION' in environ_cp:
return

ask_nccl_version = (
'Please specify the locally installed NCCL version you want to use. '
'[Leave empty to use http://github.com/nvidia/nccl]: ')
tf_nccl_version = get_from_env_or_user_or_default(environ_cp,
'TF_NCCL_VERSION',
ask_nccl_version, '')
environ_cp['TF_NCCL_VERSION'] = tf_nccl_version


def get_native_cuda_compute_capabilities(environ_cp):
"""Get native cuda compute capabilities.
Args:
environ_cp: copy of the os.environ.
Returns:
string of native cuda compute capabilities, separated by comma.
"""
device_query_bin = os.path.join(
environ_cp.get('CUDA_TOOLKIT_PATH'), 'extras/demo_suite/deviceQuery')
if os.path.isfile(device_query_bin) and os.access(device_query_bin, os.X_OK):
try:
output = run_shell(device_query_bin).split('\n')
pattern = re.compile('[0-9]*\\.[0-9]*')
output = [pattern.search(x) for x in output if 'Capability' in x]
output = ','.join(x.group() for x in output if x is not None)
except subprocess.CalledProcessError:
output = ''
else:
output = ''
return output


def set_tf_cuda_compute_capabilities(environ_cp):
"""Set TF_CUDA_COMPUTE_CAPABILITIES."""
while True:
native_cuda_compute_capabilities = get_native_cuda_compute_capabilities(
environ_cp)
if not native_cuda_compute_capabilities:
default_cuda_compute_capabilities = _DEFAULT_CUDA_COMPUTE_CAPABILITIES
else:
default_cuda_compute_capabilities = native_cuda_compute_capabilities
default_cuda_compute_capabilities = _DEFAULT_CUDA_COMPUTE_CAPABILITIES

ask_cuda_compute_capabilities = (
'Please specify a list of comma-separated CUDA compute capabilities '
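
With get_native_cuda_compute_capabilities deleted, the default compute capabilities always fall back to _DEFAULT_CUDA_COMPUTE_CAPABILITIES instead of probing the local GPU via deviceQuery. For anyone who still wants the auto-detected value to feed into the remaining prompt, nvidia-smi reports the same information (a sketch; the compute_cap query field assumes a reasonably recent driver):

  # Prints one capability per GPU, e.g. "8.6"; join with commas to form
  # a TF_CUDA_COMPUTE_CAPABILITIES value.
  nvidia-smi --query-gpu=compute_cap --format=csv,noheader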
@@ -1217,73 +1125,6 @@ def configure_ios(environ_cp):
symlink_force(filepath, new_filepath)


def validate_cuda_config(environ_cp):
"""Run find_cuda_config.py and return cuda_toolkit_path, or None."""

def maybe_encode_env(env):
"""Encodes unicode in env to str on Windows python 2.x."""
if not is_windows() or sys.version_info[0] != 2:
return env
for k, v in env.items():
if isinstance(k, unicode):
k = k.encode('ascii')
if isinstance(v, unicode):
v = v.encode('ascii')
env[k] = v
return env

cuda_libraries = ['cuda', 'cudnn']
if is_linux():
if int(environ_cp.get('TF_NEED_TENSORRT', False)):
cuda_libraries.append('tensorrt')
if environ_cp.get('TF_NCCL_VERSION', None):
cuda_libraries.append('nccl')
if is_windows():
if int(environ_cp.get('TF_NEED_TENSORRT', False)):
cuda_libraries.append('tensorrt')
print('WARNING: TensorRT support on Windows is experimental\n')

paths = glob.glob('**/third_party/gpus/find_cuda_config.py', recursive=True)
if not paths:
raise FileNotFoundError(
"Can't find 'find_cuda_config.py' script inside working directory")
proc = subprocess.Popen(
[environ_cp['PYTHON_BIN_PATH'], paths[0]] + cuda_libraries,
stdout=subprocess.PIPE,
env=maybe_encode_env(environ_cp))

if proc.wait():
# Errors from find_cuda_config.py were sent to stderr.
print('Asking for detailed CUDA configuration...\n')
return False

config = dict(
tuple(line.decode('ascii').rstrip().split(': ')) for line in proc.stdout)

print('Found CUDA %s in:' % config['cuda_version'])
print(' %s' % config['cuda_library_dir'])
print(' %s' % config['cuda_include_dir'])

print('Found cuDNN %s in:' % config['cudnn_version'])
print(' %s' % config['cudnn_library_dir'])
print(' %s' % config['cudnn_include_dir'])

if 'tensorrt_version' in config:
print('Found TensorRT %s in:' % config['tensorrt_version'])
print(' %s' % config['tensorrt_library_dir'])
print(' %s' % config['tensorrt_include_dir'])

if config.get('nccl_version', None):
print('Found NCCL %s in:' % config['nccl_version'])
print(' %s' % config['nccl_library_dir'])
print(' %s' % config['nccl_include_dir'])

print('\n')

environ_cp['CUDA_TOOLKIT_PATH'] = config['cuda_toolkit_path']
return True


def get_gcc_compiler(environ_cp):
gcc_env = environ_cp.get('CXX') or environ_cp.get('CC') or which('gcc')
if gcc_env is not None:
@@ -1388,58 +1229,6 @@ def main():
if (environ_cp.get('TF_NEED_CUDA') == '1' and
'TF_CUDA_CONFIG_REPO' not in environ_cp):

set_action_env_var(
environ_cp,
'TF_NEED_TENSORRT',
'TensorRT',
False,
bazel_config_name='tensorrt')

environ_save = dict(environ_cp)
for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):

if validate_cuda_config(environ_cp):
cuda_env_names = [
'TF_CUDA_VERSION',
'TF_CUBLAS_VERSION',
'TF_CUDNN_VERSION',
'TF_TENSORRT_VERSION',
'TF_NCCL_VERSION',
'TF_CUDA_PATHS',
# Items below are for backwards compatibility when not using
# TF_CUDA_PATHS.
'CUDA_TOOLKIT_PATH',
'CUDNN_INSTALL_PATH',
'NCCL_INSTALL_PATH',
'NCCL_HDR_PATH',
'TENSORRT_INSTALL_PATH'
]
# Note: set_action_env_var above already writes to bazelrc.
for name in cuda_env_names:
if name in environ_cp:
write_action_env_to_bazelrc(name, environ_cp[name])
break

# Restore settings changed below if CUDA config could not be validated.
environ_cp = dict(environ_save)

set_tf_cuda_version(environ_cp)
set_tf_cudnn_version(environ_cp)
if is_windows():
set_tf_tensorrt_version(environ_cp)
if is_linux():
set_tf_tensorrt_version(environ_cp)
set_tf_nccl_version(environ_cp)

set_tf_cuda_paths(environ_cp)

else:
raise UserInputError(
'Invalid CUDA setting were provided %d '
'times in a row. Assuming to be a scripting mistake.'
% _DEFAULT_PROMPT_ASK_ATTEMPTS
)

set_tf_cuda_compute_capabilities(environ_cp)
if 'LD_LIBRARY_PATH' in environ_cp and environ_cp.get(
'LD_LIBRARY_PATH') != '1':
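
With the TensorRT, NCCL, cuDNN, and CUDA-path prompts removed from main(), the CUDA branch of the configure script reduces to the CUDA version and compute-capability questions. A sketch of driving it non-interactively (the variable names survive in the code above; the values are illustrative):

  TF_NEED_CUDA=1 \
  TF_CUDA_VERSION=12.3 \
  TF_CUDA_COMPUTE_CAPABILITIES="sm_60,sm_70,compute_80" \
  python configure.py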
2 changes: 1 addition & 1 deletion tensorflow/compiler/aot/embedded_protocol_buffers.h
@@ -26,7 +26,7 @@ limitations under the License.

namespace tensorflow {
namespace tfcompile {
using xla::StatusOr;
using absl::StatusOr;

// Represents a set of protocol buffers embedded into an object file and
// describes how to access them at runtime.
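
This hunk swaps the xla::StatusOr alias for absl::StatusOr in one header; xla::StatusOr was an alias for the absl type, so the rewrite is mechanical wherever it appears. Applied tree-wide, it amounts to something like the following (illustrative only, not the tooling actually used; include paths may also need updating):

  # Rewrite the alias in place, then review the resulting diff by hand.
  grep -rl 'xla::StatusOr' tensorflow/ | xargs sed -i 's/xla::StatusOr/absl::StatusOr/g'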
(Diff truncated: the remaining 137 changed files are not shown.)
