Have you reproduced the bug with TensorFlow Nightly?
No
Source
source
TensorFlow version
2.0.0
Custom code
Yes
OS platform and distribution
Ubuntu 20.04.4 LTS
Mobile device
No response
Python version
3.6.5
Bazel version
No response
GCC/compiler version
No response
CUDA/cuDNN version
CUDA Version: 12.2
GPU model and memory
No response
Current behavior?
When I tried to train a model with the Poisson loss function and the Adam optimizer, the loss value of the model became NaN.
The model file and test data are attached here. To simplify identifying the bug, I have only included a portion of the code for the forward and backward propagation. 000116.zip
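For context, Keras' Poisson loss is essentially mean(y_pred - y_true * log(y_pred + epsilon)), so it can blow up to NaN/Inf once predictions drift toward zero or negative values during training. Below is a minimal, self-contained sketch of the reported configuration (Poisson loss plus Adam); the tiny model and random data are illustrative only and are not the attached 000116 model:

import numpy as np
import tensorflow as tf

# Hypothetical toy setup mirroring the reported loss/optimizer combination.
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.compile(loss='poisson', optimizer='adam')

x = np.random.rand(64, 4).astype(np.float32)
y = np.random.poisson(lam=1.0, size=(64, 1)).astype(np.float32)

history = model.fit(x, y, epochs=5, verbose=0)
print(history.history['loss'])  # inspect for NaN values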
Standalone code to reproduce the issue
# trainer.py
from typing import List, Optional
import numpy as np
import random
from pathlib import Path
import subprocess
import platform


class CmdProcess(object):
    def __init__(self, cmd: str):
        super().__init__()
        self.__cmd = cmd
        self.__result = None
        self.__sys = platform.system()

    def run(self, timeout):
        p = subprocess.Popen(self.__cmd, shell=True)
        try:
            return p.wait(timeout)
        except subprocess.TimeoutExpired:
            p.kill()
            return -1

    def get_result(self):
        return self.__result if self.__sys == "Windows" else (self.__result >> 8)
losses = [
    'mean_squared_error',
    'mean_absolute_error',
    'mean_absolute_percentage_error',
    'mean_squared_logarithmic_error',
    'squared_hinge',
    'categorical_hinge',
    'logcosh',
    'huber_loss',
    'categorical_crossentropy',
    'binary_crossentropy',
    'kullback_leibler_divergence',
    'poisson',
    'cosine_proximity',
]
optimizers = [
    'sgd',
    'rmsprop',
    'adagrad',
    'adadelta',
    'adam',
    'adamax',
    'nadam',
]
class Trainer(object):
    def __init__(self, timeout=1000):
        super().__init__()
        self.__timeout = timeout

    def train(self, model_id: int, exp_dir: str, ok_backends: List[str], loss: Optional[str] = None,
              optimizer: Optional[str] = None):
        # Directories where each backend's subprocess dumps layer outputs, loss and gradients.
        model_dir = Path(exp_dir) / 'models'
        training_inputs_path = Path(exp_dir) / 'dataset' / 'inputs.npz'
        ground_truths_path = Path(exp_dir) / 'dataset' / 'ground_truths.npz'
        outputs_dir = Path(exp_dir) / 'layer_outputs'
        loss_dir = Path(exp_dir) / 'loss'
        loss_grads_dir = Path(exp_dir) / 'loss_gradients'
        gradients_dir = Path(exp_dir) / 'layer_gradients'
        # weights_dir = Path(exp_dir) / 'layer_weights'
        outputs_dir.mkdir(parents=True, exist_ok=True)
        loss_dir.mkdir(parents=True, exist_ok=True)
        loss_grads_dir.mkdir(parents=True, exist_ok=True)
        gradients_dir.mkdir(parents=True, exist_ok=True)
        if loss is None:
            loss = random.choice(losses)
        if optimizer is None:
            optimizer = random.choice(optimizers)
        crash_backends, nan_backends, inf_backends = [], [], []
        backends_outputs, backends_losses, backends_loss_grads, backends_grads, backends_weights = {}, {}, {}, {}, {}
        # One training subprocess per backend; each runs train.py and writes its results to disk.
        cmd_processes = {
            bk: CmdProcess(f"/root/anaconda3/envs/{bk}/bin/python -m train"
                           f" --backend {bk}"
                           f" --loss {loss}"
                           f" --optimizer {optimizer}"
                           f" --model_path {str(model_dir / f'{bk}.h5')}"
                           f" --model_info_path {str(model_dir / 'model.json')}"
                           f" --training_instances_path {str(training_inputs_path)}"
                           f" --ground_truths_path {str(ground_truths_path)}"
                           f" --outputs_dir {str(outputs_dir / bk)}"
                           f" --loss_path {str(loss_dir / f'{bk}.txt')}"
                           f" --loss_grads_dir {str(loss_grads_dir / bk)}"
                           f" --gradients_dir {str(gradients_dir / bk)}")
                           # f" --weights_dir {str(weights_dir / bk)}")
            for bk in ok_backends
        }
        status = {}
        for bk, p in cmd_processes.items():
            # The child's exit code encodes how far train.py got (see FLAG in train.py).
            extract_status = p.run(self.__timeout)
            print(f"{bk}_status: {extract_status}")
            status[bk] = extract_status
            if extract_status and extract_status in [255, 1, -1]:
                crash_backends.append(bk)
            else:
                outputs_data = loss_value = loss_grads_data = grads_data = None  # weights_data = None
                if extract_status == 0 or extract_status >= 2:
                    outputs_data = {fn.stem: np.load(str(fn)) for fn in (outputs_dir / bk).glob("*.npy")}
                if extract_status == 0 or extract_status >= 3:
                    with open(str(loss_dir / f'{bk}.txt'), 'r') as f:
                        loss_value = float(f.read())
                if extract_status == 0 or extract_status >= 4:
                    loss_grads_data = {fn.stem: np.load(str(fn)) for fn in (loss_grads_dir / bk).glob("*.npy")}
                if extract_status == 0 or extract_status >= 5:
                    grads_data = {fn.stem: np.load(str(fn)) for fn in (gradients_dir / bk).glob("*.npy")}
                # if extract_status == 0 or extract_status >= 6:
                #     weights_data = {fn.stem: np.load(str(fn)) for fn in (weights_dir / bk).glob("*.npy")}
                backends_outputs[bk] = outputs_data
                backends_losses[bk] = loss_value
                backends_loss_grads[bk] = loss_grads_data
                backends_grads[bk] = grads_data
                # backends_weights[bk] = weights_data
                if extract_status == 0 or (extract_status != 255 and extract_status >= 2):
                    if self.__check(outputs_data, np.isnan):
                        nan_backends.append(bk)
                    if self.__check(outputs_data, np.isinf):
                        inf_backends.append(bk)
        return status, backends_outputs, backends_losses, backends_loss_grads, backends_grads, \
            [bk for bk in ok_backends if bk not in crash_backends], loss, optimizer

    def __check(self, weights, f):
        for w in weights.values():
            if f(w).any():
                return True
        return False
if __name__ == '__main__':
    tr = Trainer(timeout=1000)
    model_id = 116
    exp_dir = './dataset/000116'
    ok_backends = ['tensorflow']
    loss = 'poisson'
    optimizer = 'adam'
    print("---------------Train---------------")
    status, backends_outputs, backends_losses, backends_loss_grads, backends_grads, ok_backends, loss, optimizer = tr.train(
        model_id=model_id,
        exp_dir=exp_dir,
        ok_backends=ok_backends,
        loss=loss,
        optimizer=optimizer)
    print("---------------Finish---------------")
    print("---------------Calculate---------------")
    from calculate import Comparator

    cmp = Comparator()
    is_nan = cmp.compare(backends_outputs=backends_outputs,
                         backends_losses=backends_losses,
                         backends_loss_grads=backends_loss_grads,
                         backends_grads=backends_grads,
                         ok_backends=ok_backends)
    if is_nan:
        print("loss:", loss)
        print("optimizer:", optimizer)
    print("---------------Finish---------------")
######################################################## train.py
import argparse
import sys
import json
from pathlib import Path
import numpy as np
import warnings
import os

warnings.filterwarnings("ignore")
use_gpu = True


def switch_backend(bk):
    os.environ['KMP_WARNINGS'] = '0'
    os.environ['KERAS_BACKEND'] = bk
    # import keras.backend as K
    # K.set_image_data_format("channels_last")
    if bk == 'tensorflow':
        os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'  # only show errors
        import tensorflow as tf
        # tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
        print(f"tensorflow version: {tf.__version__}")
        if use_gpu:
            if str(tf.__version__)[0] == '1':
                from tensorflow.compat.v1 import GPUOptions
                from tensorflow.compat.v1 import ConfigProto
                from tensorflow.compat.v1 import Session
                from keras.backend.tensorflow_backend import set_session
                gpu_options = GPUOptions(allow_growth=True)
                set_session(Session(config=ConfigProto(gpu_options=gpu_options)))
            else:
                gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
                for gpu in gpus:
                    tf.config.experimental.set_memory_growth(gpu, True)
    elif bk == 'theano':
        os.environ['THEANO_FLAGS'] = "device=cuda,contexts=dev0->cuda0," \
                                     "force_device=True,floatX=float32,lib.cnmem=1"
        import theano as th
        print(f"theano version: {th.__version__}")
    else:
        from cntk.device import try_set_default_device, gpu
        # try_set_default_device(gpu(0))
        import cntk as ck
        print(f"cntk version: {ck.__version__}")
def __prepare(loss_type: str, optimizer_type: str, training_instances_path: str, ground_truths_path: str,
              model_path: str, model_info_path: str):
    model = keras.models.load_model(model_path)
    with open(model_info_path, "r") as f:
        model_info = json.load(f)
    input_objects_names = [model_info['model_structure'][str(idx)]['args']['name'] for idx in
                           model_info["input_id_list"]]
    output_layers_names = [model_info['model_structure'][str(idx)]['args']['name'] for idx in
                           model_info["output_id_list"]]
    tmp = np.load(training_instances_path)
    training_instances = [*tmp.values()]
    ground_truths = [*np.load(ground_truths_path).values()]
    model.compile(loss=loss_type, optimizer=optimizer_type)
    ins = model._feed_inputs + model._feed_targets + model._feed_sample_weights
    x, y, sample_weight = model._standardize_user_data(training_instances, ground_truths)
    ins_value = x + y + sample_weight
    return model, input_objects_names, output_layers_names, training_instances, ground_truths, ins, ins_value
def __get_outputs(model, input_objects_names, x,
                  output_dir: str):
    get_layer_output = K.function(model._feed_inputs + [K.learning_phase()],
                                  [layer.output for layer in model.layers if layer.name not in input_objects_names])
    layers_names = [layer.name for layer in model.layers if layer.name not in input_objects_names]
    layers_outputs = get_layer_output(x + [1])

    def save_outputs(layers_names, layers_outputs, output_dir):
        for name, output in zip(layers_names, layers_outputs):
            save_path = Path(output_dir) / f'{name}.npy'
            np.save(save_path, output)

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    save_outputs(input_objects_names + layers_names, x + layers_outputs, output_dir)
    return {layer_name: output for layer_name, output in zip(input_objects_names + layers_names, x + layers_outputs)}
def __get_loss(model, ins, ins_value,
               loss_path: str):
    get_loss = K.function(
        ins + [K.learning_phase()],
        [model.total_loss]
    )
    loss_value = get_loss(ins_value + [1])[0]

    def save_loss(loss, output_path):
        with open(output_path, 'w') as f:
            f.write(str(loss))

    save_loss(loss_value, loss_path)
def __get_loss_gradients(model, output_layers_names, ins, ins_value, layers_outputs_value, y,
                         loss_grads_dir: str, model_info_path: str):
    layer_outputs = [model.get_layer(layer_name).output for layer_name in output_layers_names]
    if K.backend() == 'cntk':
        loss_grads_value = [__cntk_get_gradients(model, layer_name, layers_outputs_value, y, model_info_path) for
                            layer_name in output_layers_names]
    else:
        get_loss_grads = K.function(
            ins + [K.learning_phase()],
            K.gradients(model.total_loss, layer_outputs)
        )
        loss_grads_value = get_loss_grads(ins_value + [1])

    def save_loss_grad(layer_names, grads_value, output_dir):
        for layer_name, g in zip(layer_names, grads_value):
            save_path = Path(output_dir) / f'{layer_name}.npy'
            np.save(save_path, g)

    Path(loss_grads_dir).mkdir(parents=True, exist_ok=True)
    save_loss_grad(output_layers_names, loss_grads_value, loss_grads_dir)
def __get_gradients(model, input_objects_names, ins, ins_value, layers_outputs_value, y,
                    grads_dir: str, model_info_path: str):
    layer_names = input_objects_names + [layer.name for layer in model.layers if layer.name not in input_objects_names]
    if K.backend() == 'cntk':
        grads = [__cntk_get_gradients(model, layer_name, layers_outputs_value, y, model_info_path) for layer_name in
                 layer_names]
    else:
        layer_outputs = model.inputs + [layer.output for layer in model.layers if layer.name not in input_objects_names]
        get_gradients = K.function(
            ins + [K.learning_phase()],
            K.gradients(model.total_loss, layer_outputs)
        )
        grads = get_gradients(ins_value + [1])

    def save_gradients(layer_names, grads, output_dir):
        for name, g in zip(layer_names, grads):
            save_path = Path(output_dir) / f'{name}.npy'
            np.save(save_path, g)

    Path(grads_dir).mkdir(parents=True, exist_ok=True)
    save_gradients(layer_names, grads, grads_dir)
def __cntk_get_gradients(model, layer_name, layer_outputs_value, y_true_list, model_info_path):
    with open(str(model_info_path), "r") as f:
        model_info = json.load(f)
    import cntk as C
    _output = C.input_variable(model.get_layer(layer_name).output_shape[1:], needs_gradient=True)
    tmp_input = keras.Input(tensor=_output)
    tmp_inputs = []
    extra_inputs = []
    extra_input_datas = []
    layer_outputs = {}

    def get_inbound_layers(layer):
        ids = model_info['model_structure'][str(int(layer.name[:2]))]['pre_layers']
        names = [model_info['model_structure'][str(idx)]['args']['name'] for idx in ids]
        return [model.get_layer(name) for name in names]

    def get_output_of_layer(layer):
        if layer.name in layer_outputs:
            return layer_outputs[layer.name]
        if layer.name == layer_name:
            tmp_inputs.append(tmp_input)
            layer_outputs[layer.name] = tmp_input
            return tmp_input
        if int(layer.name[:2]) < int(layer_name[:2]):
            _input = C.input_variable(layer.output_shape[1:], needs_gradient=False)
            tmp = keras.Input(tensor=_input)
            extra_inputs.append(_input)
            extra_input_datas.append(layer_outputs_value[layer.name])
            tmp_inputs.append(tmp)
            layer_outputs[layer.name] = tmp
            return tmp
        inbound_layers = get_inbound_layers(layer)
        layer_ins = [get_output_of_layer(layer) for layer in inbound_layers]
        out = layer(layer_ins[0] if len(layer_ins) == 1 else layer_ins)
        layer_outputs[layer.name] = out
        return out

    tmp_outputs = []
    for output_id in model_info['output_id_list']:
        name = model_info['model_structure'][str(output_id)]['args']['name']
        tmp_outputs.append(get_output_of_layer(model.get_layer(name)))
    tmp_model = keras.models.Model(inputs=tmp_inputs, outputs=tmp_outputs)
    x = [layer_outputs_value[layer_name], *extra_input_datas]
    y_true = y_true_list
    tmp_model.compile(loss=model.loss, optimizer=model.optimizer)
    ins = [_output, *extra_inputs] + tmp_model._feed_targets + tmp_model._feed_sample_weights
    _, y, sample_weight = tmp_model._standardize_user_data([], y_true)
    ins_value = x + y + sample_weight
    grads = tmp_model.total_loss.grad({
        k: v
        for k, v in zip(ins, ins_value)
    }, [_output])
    return grads
if __name__ == "__main__":
parse = argparse.ArgumentParser()
parse.add_argument("--backend", type=str)
parse.add_argument("--loss", type=str)
parse.add_argument("--optimizer", type=str)
parse.add_argument("--model_path", type=str)
parse.add_argument("--model_info_path", type=str)
parse.add_argument("--training_instances_path", type=str)
parse.add_argument("--ground_truths_path", type=str)
parse.add_argument("--outputs_dir", type=str)
parse.add_argument("--loss_path", type=str)
parse.add_argument("--loss_grads_dir", type=str)
parse.add_argument("--gradients_dir", type=str)
flags, _ = parse.parse_known_args(sys.argv[1:])
FLAG = 0
try:
switch_backend(flags.backend)
import keras
from keras import backend as K
FLAG = -1
model, input_objects_names, output_layers_names, x, y, ins, ins_value = __prepare(flags.loss, flags.optimizer,
flags.training_instances_path,
flags.ground_truths_path,
flags.model_path,
flags.model_info_path)
FLAG = 1
layers_outputs_value = __get_outputs(model, input_objects_names, x, flags.outputs_dir)
FLAG = 2
__get_loss(model, ins, ins_value, flags.loss_path)
FLAG = 3
__get_loss_gradients(model, output_layers_names, ins, ins_value, layers_outputs_value, y, flags.loss_grads_dir,
flags.model_info_path)
FLAG = 4
__get_gradients(model, input_objects_names, ins, ins_value, layers_outputs_value, y, flags.gradients_dir,
flags.model_info_path)
FLAG = 5
# __get_weights(model, x, y, flags.weights_dir)# FLAG = 6ifK.backend() in ['tensorflow', 'cntk']:
K.clear_session()
except Exception:
import traceback
log_dir = Path(flags.outputs_dir).parent.parent / 'logs'
log_dir.mkdir(parents=True, exist_ok=True)
with (log_dir / 'detection.log').open(mode='a', encoding='utf-8') as f:
f.write(f"[ERROR] Crash when training model with {flags.backend}\n")
traceback.print_exc(file=f)
f.write("\n\n")
sys.exit(FLAG)
######################################################## calculate.py
from itertools import combinations
import numpy as np
from pathlib import Path


def isnan_func(lis: list):
    for arr in lis:
        if np.isnan(arr).any() or np.isinf(arr).any():
            return True
    return False


class Comparator(object):
    def __init__(self):
        super().__init__()
        self.__log_dir = Path(".")

    def compare(self, backends_outputs: dict, backends_losses: dict,
                backends_loss_grads: dict, backends_grads: dict, ok_backends: list):
        for bk in ok_backends:
            outputs1 = backends_outputs.get(bk, None)
            loss1 = backends_losses.get(bk, None)
            loss_grads1 = backends_loss_grads.get(bk, None)
            grads1 = backends_grads.get(bk, None)
            print(f"bk:{bk}")
            of1 = isnan_func(list(outputs1.values()))
            lgf1 = isnan_func(list(loss_grads1.values()))
            gf1 = isnan_func(list(grads1.values()))
            ls1 = np.isnan(loss1).any()
            print(f"output1:{of1}")
            print(f"loss1:{ls1}")
            print(f"loss_grads1:{lgf1}")
            print(f"grads1:{gf1}")
            if of1 or lgf1 or gf1 or ls1:
                return True
        return False
You can run 'trainer.py' to reproduce the bug.
Relevant log output
No response
It looks like you are using an older version of TensorFlow (2.0). Many bugs have been fixed in the latest version. Can you please execute your code using the latest version (2.15.0 or 2.16.1) and let us know if the issue still persists?
Thank you!
Hello, I have updated the version of TensorFlow to 2.15.0, but I got the error below:

[ERROR] Crash when training model with tensorflow
Traceback (most recent call last):
  File "train.py", line 254, in <module>
    model, input_objects_names, output_layers_names, x, y, ins, ins_value = __prepare(flags.loss, flags.optimizer,
  File "train.py", line 69, in __prepare
    ins = model._feed_inputs + model._feed_targets + model._feed_sample_weights
AttributeError: 'Functional' object has no attribute '_feed_targets'

Could you help me solve the problem?
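For reference, `_feed_targets`, `_feed_sample_weights`, and `_standardize_user_data` are private Keras 2.x training internals that no longer exist on `Functional` models in TF 2.15, so the loss and gradient extraction has to be rewritten on public APIs. A rough sketch of an equivalent loss-and-gradient computation with `tf.GradientTape` (the paths and the single-input/single-output handling are placeholders, not the exact script):

import numpy as np
import tensorflow as tf

# Placeholder paths; substitute the files from the attached 000116 archive.
model = tf.keras.models.load_model('dataset/000116/models/tensorflow.h5', compile=False)
loss_fn = tf.keras.losses.Poisson()

x = [tf.convert_to_tensor(a, tf.float32) for a in np.load('dataset/000116/dataset/inputs.npz').values()]
y = [tf.convert_to_tensor(a, tf.float32) for a in np.load('dataset/000116/dataset/ground_truths.npz').values()]

with tf.GradientTape() as tape:
    preds = model(x[0] if len(x) == 1 else x, training=True)
    preds = preds[0] if isinstance(preds, (list, tuple)) else preds
    loss = loss_fn(y[0], preds)

# Gradients of the loss w.r.t. the trainable weights; NaNs here correspond to the report.
grads = tape.gradient(loss, model.trainable_variables)
print(float(loss), any(np.isnan(g.numpy()).any() for g in grads if g is not None))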