Change logging of losses.
PiperOrigin-RevId: 463644815
anastasiyabl authored and Copybara-Service committed Jul 27, 2022
1 parent 0609176 commit 5ba66da
Showing 3 changed files with 84 additions and 72 deletions.
70 changes: 40 additions & 30 deletions deepconsensus/models/model_distillation.py
@@ -282,47 +282,57 @@ def distributed_eval_step(iterator):
           axis=None)
     return reduced_eval_losses_dict
 
-  log_steps = 100
+  log_train_steps = 100
+  log_eval_steps = 3000
   if FLAGS.eval_and_log_every_step:
-    log_steps = 1
+    log_train_steps = 1
   train_iterator = iter(train_dataset)
   eval_iterator = iter(eval_dataset)
   min_eval_loss = 1e6
+  total_train_steps = steps_per_epoch * params['num_epochs']
+  logging.info('Total training steps = %s', total_train_steps)
+
   for epoch in range(initial_epoch, params['num_epochs']):
     logging.info('Starting to run epoch: %s', epoch)
-    with train_writer.as_default():
-      for step in range(steps_per_epoch):
-        reduced_train_losses = distributed_train_step(train_iterator)
-        if step % log_steps == 0:
+    for step_train in range(steps_per_epoch):
+      distributed_train_step(train_iterator)
+      # Log and reset train metrics.
+      if optimizer.iterations % log_train_steps == 0:
+        with train_writer.as_default():
           model_utils.log_and_save_metrics(
               epoch=epoch,
-              step=step,
+              step=step_train,
               total_steps=steps_per_epoch,
               optimizer=optimizer,
-              losses_dict=reduced_train_losses,
-              metrics=train_metrics,
+              metrics=[train_loss] + train_metrics,
               training=True)
-    with eval_writer.as_default():
-      for step in range(steps_per_eval):
-        reduced_eval_losses = distributed_eval_step(eval_iterator)
-        model_utils.log_and_save_metrics(
-            epoch=epoch,
-            step=step,
-            total_steps=steps_per_eval,
-            optimizer=optimizer,
-            losses_dict=reduced_eval_losses,
-            metrics=eval_metrics,
-            training=False)
-    checkpoint_name = model_utils.save_checkpoint(checkpoint, out_dir,
-                                                  train_metrics, eval_metrics,
-                                                  write_checkpoint_metrics)
-    if min_eval_loss > float(eval_loss.result()):
-      min_eval_loss = float(eval_loss.result())
-      with tf.io.gfile.GFile(os.path.join(out_dir, 'best_checkpoint.txt'),
-                             'w') as f:
-        f.write(os.path.basename(checkpoint_name))
-    model_utils.reset_all_metrics([train_loss, eval_loss] + train_metrics +
-                                  eval_metrics)
+      # Log eval metrics, save checkpoint, and reset eval metrics every
+      # log_eval_steps and at the end of training.
+      if (optimizer.iterations % log_eval_steps == 0) or (optimizer.iterations
+                                                          == total_train_steps):
+        # Run evalution on the whole eval dataset and collect metrics.
+        for step_eval in range(steps_per_eval):
+          distributed_eval_step(eval_iterator)
+        # Save checkpoint.
+        checkpoint_name = model_utils.save_checkpoint(
+            checkpoint, out_dir, [eval_loss] + eval_metrics,
+            write_checkpoint_metrics)
+        # Record the best checkpoint.
+        if min_eval_loss > float(eval_loss.result()):
+          min_eval_loss = float(eval_loss.result())
+          with tf.io.gfile.GFile(
+              os.path.join(out_dir, 'best_checkpoint.txt'), 'w') as f:
+            f.write(os.path.basename(checkpoint_name))
+        # Log metrics on the eval set, this must be done at the end since
+        # log_and_save_metrics will reset the eval metrics values.
+        with eval_writer.as_default():
+          model_utils.log_and_save_metrics(
+              epoch=epoch,
+              step=step_eval,
+              total_steps=steps_per_eval,
+              optimizer=optimizer,
+              metrics=[eval_loss] + eval_metrics,
+              training=False)
 
 
 def train(teacher_model_dir: str,
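
The new distillation loop keys all logging off optimizer.iterations rather than the per-epoch step counter: train metrics are written and reset every log_train_steps updates, and a full pass over the eval set (plus a checkpoint) runs every log_eval_steps updates and once more at the final step. Below is a minimal, self-contained sketch of that cadence; the constants, the toy variable var, and the train_step/eval_step helpers are illustrative stand-ins, not the DeepConsensus training code.

import tensorflow as tf

# Toy stand-ins for the real loop: the values and helpers below are
# illustrative only.
log_train_steps = 100
log_eval_steps = 3000
steps_per_epoch = 250
steps_per_eval = 50
num_epochs = 2
total_train_steps = steps_per_epoch * num_epochs

optimizer = tf.keras.optimizers.Adam()
train_loss = tf.keras.metrics.Mean(name='train_loss')
eval_loss = tf.keras.metrics.Mean(name='eval_loss')
var = tf.Variable(1.0)


def train_step():
  # Any update that advances optimizer.iterations works for the sketch.
  with tf.GradientTape() as tape:
    loss = tf.square(var)
  grads = tape.gradient(loss, [var])
  optimizer.apply_gradients(zip(grads, [var]))
  train_loss.update_state(loss)


def eval_step():
  eval_loss.update_state(tf.square(var))


for epoch in range(num_epochs):
  for _ in range(steps_per_epoch):
    train_step()
    step = int(optimizer.iterations)
    # Train metrics are logged and reset on a fixed optimizer-step cadence.
    if step % log_train_steps == 0:
      print(f'step {step}: train_loss={float(train_loss.result()):.4f}')
      train_loss.reset_states()
    # Eval runs every log_eval_steps updates and once more at the last step.
    if step % log_eval_steps == 0 or step == total_train_steps:
      for _ in range(steps_per_eval):
        eval_step()
      print(f'step {step}: eval_loss={float(eval_loss.result()):.4f}')
      eval_loss.reset_states()
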
72 changes: 40 additions & 32 deletions deepconsensus/models/model_train_custom_loop.py
@@ -146,49 +146,57 @@ def distributed_eval_step(iterator):
     return strategy.reduce(
         tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
 
-  log_steps = 100
+  log_train_steps = 100
+  log_eval_steps = 3000
   if FLAGS.eval_and_log_every_step:
-    log_steps = 1
+    log_train_steps = 1
   train_iterator = iter(train_dataset)
   eval_iterator = iter(eval_dataset)
   min_eval_loss = 1e6
+  total_train_steps = steps_per_epoch * params['num_epochs']
+  logging.info('Total training steps = %s', total_train_steps)
+
   for epoch in range(initial_epoch, params['num_epochs']):
     logging.info('Starting to run epoch: %s', epoch)
-    with train_writer.as_default():
-      for step in range(steps_per_epoch):
-        reduced_train_loss = distributed_train_step(train_iterator)
-        if step % log_steps == 0:
-          train_loss_dict = {train_loss.name: reduced_train_loss}
+    for step_train in range(1, steps_per_epoch + 1):
+      distributed_train_step(train_iterator)
+      # Log and reset train metrics.
+      if optimizer.iterations % log_train_steps == 0:
+        with train_writer.as_default():
           model_utils.log_and_save_metrics(
               epoch=epoch,
-              step=step,
+              step=step_train,
               total_steps=steps_per_epoch,
               optimizer=optimizer,
-              losses_dict=train_loss_dict,
-              metrics=train_metrics,
+              metrics=[train_loss] + train_metrics,
               training=True)
-    with eval_writer.as_default():
-      for step in range(steps_per_eval):
-        reduced_eval_loss = distributed_eval_step(eval_iterator)
-        eval_loss_dict = {eval_loss.name: reduced_eval_loss}
-        model_utils.log_and_save_metrics(
-            epoch=epoch,
-            step=step,
-            total_steps=steps_per_eval,
-            optimizer=optimizer,
-            losses_dict=eval_loss_dict,
-            metrics=eval_metrics,
-            training=False)
-    checkpoint_name = model_utils.save_checkpoint(checkpoint, out_dir,
-                                                  train_metrics, eval_metrics,
-                                                  write_checkpoint_metrics)
-    if min_eval_loss > float(eval_loss.result()):
-      min_eval_loss = float(eval_loss.result())
-      with tf.io.gfile.GFile(os.path.join(out_dir, 'best_checkpoint.txt'),
-                             'w') as f:
-        f.write(os.path.basename(checkpoint_name))
-    model_utils.reset_all_metrics([train_loss, eval_loss] + train_metrics +
-                                  eval_metrics)
+      # Log eval metrics, save checkpoint, and reset eval metrics every
+      # log_eval_steps and at the end of training.
+      if (optimizer.iterations % log_eval_steps == 0) or (optimizer.iterations
+                                                          == total_train_steps):
+        # Run evalution on the whole eval dataset and collect metrics.
+        for step_eval in range(1, steps_per_eval + 1):
+          distributed_eval_step(eval_iterator)
+        # Save checkpoint.
+        checkpoint_name = model_utils.save_checkpoint(
+            checkpoint, out_dir, [eval_loss] + eval_metrics,
+            write_checkpoint_metrics)
+        # Record the best checkpoint.
+        if min_eval_loss > float(eval_loss.result()):
+          min_eval_loss = float(eval_loss.result())
+          with tf.io.gfile.GFile(
+              os.path.join(out_dir, 'best_checkpoint.txt'), 'w') as f:
+            f.write(os.path.basename(checkpoint_name))
+        # Log metrics on the eval set, this must be done at the end since
+        # log_and_save_metrics will reset the eval metrics values.
+        with eval_writer.as_default():
+          model_utils.log_and_save_metrics(
+              epoch=epoch,
+              step=step_eval,
+              total_steps=steps_per_eval,
+              optimizer=optimizer,
+              metrics=[eval_loss] + eval_metrics,
+              training=False)
 
 
 def train(out_dir: str,
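
Both training scripts now hand only [eval_loss] + eval_metrics to save_checkpoint and keep track of the best checkpoint by the lowest eval loss seen so far, recording its basename in best_checkpoint.txt. The sketch below isolates that bookkeeping; out_dir, the tiny Keras model, and the 0.42 loss value are made-up placeholders for the real training state.

import os

import tensorflow as tf

# Made-up placeholders for the real training state.
out_dir = '/tmp/deepconsensus_best_ckpt_demo'
min_eval_loss = 1e6
eval_loss = tf.keras.metrics.Mean(name='eval_loss')
net = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
checkpoint = tf.train.Checkpoint(model=net)

tf.io.gfile.makedirs(out_dir)
eval_loss.update_state(0.42)  # Pretend this came from a full eval pass.

# Save a checkpoint after the eval pass, then remember the best one seen.
checkpoint_name = checkpoint.save(os.path.join(out_dir, 'checkpoint'))
if min_eval_loss > float(eval_loss.result()):
  min_eval_loss = float(eval_loss.result())
  with tf.io.gfile.GFile(os.path.join(out_dir, 'best_checkpoint.txt'),
                         'w') as f:
    f.write(os.path.basename(checkpoint_name))
eval_loss.reset_states()
print('best checkpoint so far:', os.path.basename(checkpoint_name))
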
14 changes: 4 additions & 10 deletions deepconsensus/models/model_utils.py
@@ -31,7 +31,7 @@
 import json
 import logging
 import os
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, List, Optional, Tuple, Union
 
 import ml_collections
 import numpy as np
@@ -395,22 +395,18 @@ def reset_all_metrics(metrics: List[tf.keras.metrics.Metric]) -> None:
 
 def log_and_save_metrics(epoch: int, step: int, total_steps: int,
                          optimizer: tf.keras.optimizers.Optimizer,
-                         losses_dict: Dict[str, float],
                          metrics: List[tf.keras.metrics.Metric],
                          training: bool) -> None:
   """Logs metrics and saves them for TensorBoard."""
   logging.info(
       'epoch: %d step: %d of %d metrics: %s', epoch, step, total_steps,
-      ' '.join(f'{loss_name}= {losses_dict[loss_name]}'
-               for loss_name in losses_dict.keys()))
+      ' '.join(f'{metric.name}= {metric.result()}' for metric in metrics))
 
   if training:
     tf.summary.scalar('learning_rate', optimizer.lr, step=optimizer.iterations)
-  for loss_name in losses_dict.keys():
-    tf.summary.scalar(
-        loss_name, losses_dict[loss_name], step=optimizer.iterations)
   for metric in metrics:
     tf.summary.scalar(metric.name, metric.result(), step=optimizer.iterations)
+    metric.reset_states()
 
 
 def write_row(handle: Union[io.TextIOWrapper], row: List[Any]) -> None:
@@ -419,7 +415,6 @@ def write_row(handle: Union[io.TextIOWrapper], row: List[Any]) -> None:
 
 
 def save_checkpoint(checkpoint: tf.train.Checkpoint, out_dir: str,
-                    train_metrics: List[tf.keras.metrics.Metric],
                     eval_metrics: List[tf.keras.metrics.Metric],
                     write_checkpoint_metrics: bool) -> str:
   """Save checkpoint and return its name."""
@@ -434,8 +429,7 @@ def save_checkpoint(checkpoint: tf.train.Checkpoint, out_dir: str,
         write_row(f, row)
 
     with tf.io.gfile.GFile(metrics_file, 'a') as f:
-      for group_name, metrics in [('train', train_metrics),
-                                  ('eval', eval_metrics)]:
+      for group_name, metrics in [('eval', eval_metrics)]:
         for metric in metrics:
           row = [
               checkpoint_name, group_name, metric.name,
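
After this change, log_and_save_metrics takes only a list of Keras metrics (the separate losses_dict is gone), logs each metric's current result, writes it to TensorBoard, and resets it, which is why callers now pass [train_loss] + train_metrics or [eval_loss] + eval_metrics. Below is a condensed, runnable restatement of that contract, not the library code itself; the name log_and_save_metrics_sketch, the writer path, and the toy metric value are illustrative.

import logging
from typing import List

import tensorflow as tf


def log_and_save_metrics_sketch(epoch: int, step: int, total_steps: int,
                                optimizer: tf.keras.optimizers.Optimizer,
                                metrics: List[tf.keras.metrics.Metric],
                                training: bool) -> None:
  """Logs metric results to INFO and TensorBoard, then resets each metric."""
  logging.info(
      'epoch: %d step: %d of %d metrics: %s', epoch, step, total_steps,
      ' '.join(f'{metric.name}= {metric.result()}' for metric in metrics))
  if training:
    tf.summary.scalar('learning_rate', optimizer.lr, step=optimizer.iterations)
  for metric in metrics:
    # Every metric (losses included) is written and reset here, so callers
    # no longer maintain a separate losses_dict or reset metrics themselves.
    tf.summary.scalar(metric.name, metric.result(), step=optimizer.iterations)
    metric.reset_states()


# Example usage with a summary writer and a toy loss metric.
writer = tf.summary.create_file_writer('/tmp/log_and_save_metrics_demo')
optimizer = tf.keras.optimizers.Adam()
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_loss.update_state(1.5)
with writer.as_default():
  log_and_save_metrics_sketch(
      epoch=0, step=1, total_steps=100, optimizer=optimizer,
      metrics=[train_loss], training=True)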
