[go: nahoru, domu]

Skip to content

Commit

Permalink
Add profiling scope.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 505133942
  • Loading branch information
danielecook authored and Copybara-Service committed Jan 27, 2023
1 parent 5c09997 commit d482ca0
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 36 deletions.
38 changes: 20 additions & 18 deletions deepconsensus/models/model_distillation.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,31 +332,33 @@ def distributed_eval_step(iterator):
logging.info('Starting to run epoch: %s', epoch)
train_time_start = datetime.datetime.now()
for step_train in range(initial_step_train, steps_per_epoch):
distributed_train_step(train_iterator)
# Log and reset train metrics.
if optimizer.iterations % log_train_steps == 0:
train_time_end = datetime.datetime.now()
train_steps_per_second = log_train_steps / (
train_time_end - train_time_start).total_seconds()
with train_writer.as_default():
model_utils.log_and_save_metrics(
epoch=epoch,
num_epochs=params['num_epochs'],
step=step_train,
total_steps=steps_per_epoch,
optimizer=optimizer,
metrics=[train_loss] + train_metrics,
training=True,
steps_per_second=train_steps_per_second)
train_time_start = datetime.datetime.now()
with tf.profiler.experimental.Trace('train', step_num=step_train, _r=1):
distributed_train_step(train_iterator)
# Log and reset train metrics.
if optimizer.iterations % log_train_steps == 0:
train_time_end = datetime.datetime.now()
train_steps_per_second = log_train_steps / (
train_time_end - train_time_start).total_seconds()
with train_writer.as_default():
model_utils.log_and_save_metrics(
epoch=epoch,
num_epochs=params['num_epochs'],
step=step_train,
total_steps=steps_per_epoch,
optimizer=optimizer,
metrics=[train_loss] + train_metrics,
training=True,
steps_per_second=train_steps_per_second)
train_time_start = datetime.datetime.now()
# Log eval metrics, save checkpoint, and reset eval metrics every
# log_eval_steps and at the end of training.
if (optimizer.iterations % log_eval_steps == 0) or (optimizer.iterations
== total_train_steps):
      # Run evaluation on the whole eval dataset and collect metrics.
eval_time_start = datetime.datetime.now()
for step_eval in range(steps_per_eval):
distributed_eval_step(eval_iterator)
with tf.profiler.experimental.Trace('eval', step_num=step_eval, _r=1):
distributed_eval_step(eval_iterator)
eval_time_end = datetime.datetime.now()
eval_steps_per_second = steps_per_eval / (
eval_time_end - eval_time_start).total_seconds()
Expand Down
38 changes: 20 additions & 18 deletions deepconsensus/models/model_train_custom_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,31 +206,33 @@ def distributed_eval_step(iterator):
logging.info('Starting to run epoch: %s', epoch)
train_time_start = datetime.datetime.now()
for step_train in range(initial_step_train, steps_per_epoch):
distributed_train_step(train_iterator)
# Log and reset train metrics.
if optimizer.iterations % log_train_steps == 0:
train_time_end = datetime.datetime.now()
train_steps_per_second = log_train_steps / (
train_time_end - train_time_start).total_seconds()
with train_writer.as_default():
model_utils.log_and_save_metrics(
epoch=epoch,
num_epochs=params['num_epochs'],
step=step_train,
total_steps=steps_per_epoch,
optimizer=optimizer,
metrics=[train_loss] + train_metrics,
training=True,
steps_per_second=train_steps_per_second)
train_time_start = datetime.datetime.now()
with tf.profiler.experimental.Trace('train', step_num=step_train, _r=1):
distributed_train_step(train_iterator)
# Log and reset train metrics.
if optimizer.iterations % log_train_steps == 0:
train_time_end = datetime.datetime.now()
train_steps_per_second = log_train_steps / (
train_time_end - train_time_start).total_seconds()
with train_writer.as_default():
model_utils.log_and_save_metrics(
epoch=epoch,
num_epochs=params['num_epochs'],
step=step_train,
total_steps=steps_per_epoch,
optimizer=optimizer,
metrics=[train_loss] + train_metrics,
training=True,
steps_per_second=train_steps_per_second)
train_time_start = datetime.datetime.now()
# Log eval metrics, save checkpoint, and reset eval metrics every
# log_eval_steps and at the end of training.
if (optimizer.iterations % log_eval_steps == 0) or (optimizer.iterations
== total_train_steps):
      # Run evaluation on the whole eval dataset and collect metrics.
eval_time_start = datetime.datetime.now()
for step_eval in range(1, steps_per_eval + 1):
distributed_eval_step(eval_iterator)
with tf.profiler.experimental.Trace('eval', step_num=step_eval, _r=1):
distributed_eval_step(eval_iterator)
eval_time_end = datetime.datetime.now()
eval_steps_per_second = steps_per_eval / (
eval_time_end - eval_time_start).total_seconds()
Expand Down

0 comments on commit d482ca0

Please sign in to comment.