From f25f44efeb0d544411eb863d141ac51bfcc3767d Mon Sep 17 00:00:00 2001 From: belyaeva Date: Wed, 29 Jun 2022 11:51:02 -0700 Subject: [PATCH] Update tf version to >= 2.9 and remove remaining dependencies on tensorflow_models.official.legacy.transformer PiperOrigin-RevId: 458027974 --- deepconsensus/models/legacy_networks.py | 393 ------------------ deepconsensus/models/legacy_networks_test.py | 118 ------ .../models/losses_and_metrics_test.py | 2 +- deepconsensus/models/model_configs.py | 78 +--- deepconsensus/models/model_distillation.py | 4 +- .../models/model_distillation_test.py | 2 +- deepconsensus/models/model_inference_test.py | 2 +- .../models/model_train_custom_loop.py | 2 +- deepconsensus/models/model_utils.py | 39 +- deepconsensus/models/model_utils_test.py | 2 +- deepconsensus/models/networks_test.py | 4 +- .../models/transformer_basic_params.py | 109 +++++ deepconsensus/testdata/README.md | 2 +- deepconsensus/testdata/model/params.json | 4 +- install-gpu.sh | 2 +- install.sh | 2 +- requirements.txt | 11 +- setup.py | 4 +- 18 files changed, 168 insertions(+), 612 deletions(-) delete mode 100644 deepconsensus/models/legacy_networks.py delete mode 100644 deepconsensus/models/legacy_networks_test.py create mode 100644 deepconsensus/models/transformer_basic_params.py diff --git a/deepconsensus/models/legacy_networks.py b/deepconsensus/models/legacy_networks.py deleted file mode 100644 index 2e9c358..0000000 --- a/deepconsensus/models/legacy_networks.py +++ /dev/null @@ -1,393 +0,0 @@ -# Copyright (c) 2021, Google Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of Google Inc. nor the names of its contributors -# may be used to endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
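
For orientation before the file-by-file changes: the legacy network wrappers deleted below pulled the encoder stack and parameter helpers from official.nlp.transformer, and the one remaining runtime dependency in model_utils.py was misc.get_model_params; this patch deletes the former and replaces the latter with a locally vendored parameter table. A minimal, illustrative sketch of that swap (the exact call sites are in the model_utils.py hunk further down):

    # Before (removed in the model_utils.py hunk below):
    #   from official.nlp.transformer import misc
    #   transformer_params = misc.get_model_params(
    #       params.transformer_model_size, num_gpus=num_gpus)
    #
    # After: the same predefined tables are vendored locally.
    from deepconsensus.models import transformer_basic_params

    params = transformer_basic_params.BASE_PARAMS.copy()
    assert params['hidden_size'] == 512 and params['num_heads'] == 8
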
-"""TF2 + tf.keras implementations of legacy_networks for DeepConsensus.""" - -import logging -from typing import Callable, Optional, Tuple - -import ml_collections -import tensorflow as tf - -from deepconsensus.models import data_providers -from official.nlp.transformer import embedding_layer -from official.nlp.transformer import model_utils -from official.nlp.transformer import transformer -from official.nlp import modeling -from official.nlp.bert import bert_models -from official.nlp.bert import configs - - -class EmbeddingSharedWeights(embedding_layer.EmbeddingSharedWeights): - - def call(self, inputs): - # make sure 0 ids match to zero emebeddings. - embeddings = super().call(inputs) - mask = tf.cast(tf.not_equal(inputs, 0), embeddings.dtype) - embeddings *= tf.expand_dims(mask, -1) - return embeddings - - -# pylint: disable=invalid-name -def FullyConnectedNet(params: ml_collections.ConfigDict) -> tf.keras.Model: - """Fully connected neural network architecture.""" - - inputs = tf.keras.Input( - shape=(params.hidden_size, params.max_length, params.num_channels)) - l2_reg = tf.keras.regularizers.l2 - net = inputs - net = tf.keras.layers.Flatten()(net) - for i in range(len(params.fc_size)): - net = tf.keras.layers.Dense( - units=params.fc_size[i], - activation=tf.nn.relu, - kernel_regularizer=l2_reg(params.l2))( - net) - net = tf.keras.layers.Dropout(rate=params.fc_dropout)(net) - - net = tf.keras.layers.Dense(units=params.max_length * params.num_classes)(net) - net = tf.keras.layers.Reshape((params.max_length, params.num_classes))(net) - net = tf.keras.layers.Softmax(axis=-1)(net) - outputs = net - return tf.keras.Model(inputs=inputs, outputs=outputs) - - -def get_conv_sub_model( - conv_model -) -> Tuple[Callable[..., tf.Tensor], Callable[[tf.keras.Model], - tf.keras.Model]]: - """Returns a predefined convolutional architecture.""" - if conv_model == 'resnet50': - return tf.keras.applications.ResNet50V2, tf.keras.applications.resnet_v2.preprocess_input - elif conv_model == 'resnet101': - return tf.keras.applications.ResNet101V2, tf.keras.applications.resnet_v2.preprocess_input - elif conv_model == 'resnet152': - return tf.keras.applications.ResNet152V2, tf.keras.applications.resnet_v2.preprocess_input - else: - raise NotImplementedError(f'conv model "{conv_model}" not found') - - -# pylint: disable=invalid-name -class ConvNet(tf.keras.Model): - """Convolutional neural network architecture.""" - - def __init__(self, params: ml_collections.ConfigDict, **kwargs): - super(ConvNet, self).__init__(params, **kwargs) - # Most conv models only accept 3 channels. - self.resnet_input_shape = (params.hidden_size, params.max_length, 3) - self.dimensions = params.max_length * params.num_classes - - model, self.conv_preprocess = get_conv_sub_model(params.conv_model) - self.model = model( - include_top=False, - weights=None, - input_shape=self.resnet_input_shape, - pooling='avg') - self.use_sn = params.use_sn - self.max_length = params.max_length - self.num_classes = params.num_classes - - # Define layers - self.layer_dense = tf.keras.layers.Dense(units=self.dimensions) - - def call(self, inputs: tf.Tensor, training: bool) -> tf.Tensor: - # Most conv models only accept 3 channels; - # The sn channel must be removed and optionally - # added back at the end. CCS rows not being used currently for this model. 
- input_rows, _, sn_rows = tf.split(inputs, [3, 1, 1], 3) - - cn_input = self.conv_preprocess(input_rows) - net = self.model(cn_input, training=training) - - if self.use_sn: - logging.info('Using SN Values') - # sn_rows was padded previously to match the input dimensions - # Crop it here back to 4 rows. - sn_rows = tf.image.crop_to_bounding_box(sn_rows, 0, 0, 4, self.max_length) - sn_rows = tf.keras.layers.Flatten()(sn_rows) - net = tf.keras.layers.Flatten()(net) - net = tf.concat([net, sn_rows], 1) - else: - net = tf.keras.layers.Flatten()(net) - - net = self.layer_dense(net) - net = tf.keras.layers.Reshape((self.max_length, self.num_classes))(net) - net = tf.keras.layers.Softmax(axis=-1)(net) - output = net - return output - - -class EncoderOnlyTransformer(transformer.Transformer): - """Modified encoder-only transformer model for DeepConsensus. - - This implementation extends the one in - //third_party/tensorflow_models/official/legacy/transformer/transformer.py. - The main changes are: - - * Removing logic relating to converting tokens to embeddings, since the - DeepConsensus is already in the form of vectors for each position. - - * Removing the decoder, since we only want to run the encoder. - - * Adding additional layers on top of the encoder for the per-position - classification task. - """ - - def __init__(self, - params: ml_collections.ConfigDict, - name: Optional[str] = None): - # Call grandparent super since we don't want to initialize embeddings. - super(transformer.Transformer, self).__init__(params, name=name) - self.params = params - if self.params.add_pos_encoding and self.params.use_relative_pos_enc: - self.position_embedding = modeling.layers.position_embedding.RelativePositionEmbedding( - hidden_size=self.params['hidden_size']) - self.encoder_stack = transformer.EncoderStack(params) - self.fc1 = tf.keras.layers.Dense( - units=(params['vocab_size']), - activation=None, - use_bias=True, - kernel_initializer='glorot_uniform', - bias_initializer='zeros') - self.softmax = tf.keras.layers.Softmax() - - def call(self, inputs: tf.Tensor, training: bool) -> tf.Tensor: - """Runs a forward pass of the model. - - Args: - inputs: tensor of shape (batch_size, hidden_size, input_length - num_channels). - training: boolean, whether in training mode or not. - - Returns: - Output from softmax layer, which is a distribution over the vocabular at - each position in the sequence. - """ - - with tf.name_scope('Transformer'): - - # Get rid of the channel dimension as we only have one channel. - inputs = tf.squeeze(inputs, -1) - - # `inputs` is of shape (batch_size, hidden_size, input_length). For the - # Transformer, we need to change the format to be the following: - # (batch_size, input_length, hidden_size). - inputs = tf.transpose(inputs, [0, 2, 1]) - - # Attention_bias for our model should be all 0s with shape - # (batch_size, 1, 1, input_length). See model_utils.get_padding_bias - # to see how this is calculated in the base model. - all_zeros = tf.reduce_sum(tf.zeros_like(inputs), -1) - attention_bias = tf.expand_dims(tf.expand_dims(all_zeros, 1), 1) - - # Run the inputs through the encoder. Encoder returns the softmax output. 
- encoder_outputs = self.encode(inputs, attention_bias, training) - logits = encoder_outputs - return logits - - def encode(self, inputs: tf.Tensor, attention_bias: tf.Tensor, - training: bool) -> tf.Tensor: - """Runs the input through Encoder stack and problem-specific layers.""" - - with tf.name_scope('encode'): - - # The input for each position is already a vector, so we do not use - # embeddings here, unlike the base model. Base model input is a token at - # each position, which must first be embedded as a vector. In the future, - # we may want to use embeddings for part of the input, such as the bases, - # so that we can learn the scale of values. - encoder_inputs = inputs - - # Positional embedding only works when we have an even value for the - # hidden_size. If hidden_size is odd, add an empty row to make it even. - if self.params.add_pos_encoding and encoder_inputs.shape[2] % 2 != 0: - empty_row = tf.zeros( - shape=(encoder_inputs.shape[0], encoder_inputs.shape[1], 1)) - encoder_inputs = tf.concat([encoder_inputs, empty_row], axis=-1) - assert self.params.hidden_size == encoder_inputs.shape[2] - - # All values in `input_padding` should be 0 and shape should be - # (batch_size, input_length). See model_utils.get_padding to see how this - # is computed for the base model. - inputs_padding = tf.reduce_sum(tf.zeros_like(encoder_inputs), -1) - - # Cast input `attention_bias` to correct type, as done in the base model. - attention_bias = tf.cast(attention_bias, self.params['dtype']) - - # Add positional encoding to the input. The scale of the positional - # encoding relative to the input values will matter since we are not - # learning the input embedding. - if self.params['add_pos_encoding']: - with tf.name_scope('add_pos_encoding'): - if self.params['use_relative_pos_enc']: - pos_encoding = self.position_embedding(inputs=encoder_inputs) - else: - pos_encoding = model_utils.get_position_encoding( - self.params['max_length'], self.params['hidden_size']) - pos_encoding = tf.cast(pos_encoding, self.params['dtype']) - encoder_inputs += pos_encoding - - # Add dropout when training. - if training: - encoder_inputs = tf.nn.dropout( - encoder_inputs, rate=self.params['layer_postprocess_dropout']) - - # Pass inputs through the encoder. As mentioned above, `inputs_padding` is - # not actually used by EncoderStack.call. Encoder stack output has shape - # (batch_size, input_length, hidden_size). - encoder_outputs = self.encoder_stack( - encoder_inputs, attention_bias, inputs_padding, training=training) - - # Pass through dense layer, and output a distribution. - encoder_outputs = self.fc1(encoder_outputs) - encoder_outputs = self.softmax(encoder_outputs) - return encoder_outputs - - def decode(self, encoder_outputs: tf.Tensor, attention_bias: tf.Tensor, - training: bool) -> tf.Tensor: - """Returns the outputs from the encoder.""" - - raise NotImplementedError - - def predict(self, encoder_inputs: tf.Tensor) -> tf.Tensor: - """Returns the argmax of the decoder output, which comes from a softmax.""" - - # The base model also has a predict method that behaves differently. This - # predict function is consistent with how predict behaves for other - # DeepConsensus models (conv, FC), but we may want to change this in the - # future to match the transformer base class. 
For more details, see: - # https://github.com/tensorflow/models/blob/bc71d8e9e155d34a38af8489ad4cbb2fde6fa152/official/nlp/transformer/transformer.py#L279 - return self.call(encoder_inputs, training=False) - - -class EncoderOnlyLearnedValuesTransformer(EncoderOnlyTransformer): - """Modified transformer that learns embeddings for the bases.""" - - def __init__(self, - params: ml_collections.ConfigDict, - name: Optional[str] = None): - super(EncoderOnlyLearnedValuesTransformer, self).__init__(params, name=name) - if params.use_bases: - self.bases_embedding_layer = EmbeddingSharedWeights( - params['vocab_size'], params['per_base_hidden_size']) - if params.use_pw: - pw_vocab_size = params.PW_MAX + 1 - self.pw_embedding_layer = EmbeddingSharedWeights(pw_vocab_size, - params['pw_hidden_size']) - if params.use_ip: - ip_vocab_size = params.IP_MAX + 1 - self.ip_embedding_layer = EmbeddingSharedWeights(ip_vocab_size, - params['ip_hidden_size']) - - - if params.use_sn: - sn_vocab_size = params.SN_MAX + 1 - self.sn_embedding_layer = EmbeddingSharedWeights(sn_vocab_size, - params['sn_hidden_size']) - - if params.use_strand: - strand_vocab_size = params.STRAND_MAX + 1 - self.strand_embedding_layer = EmbeddingSharedWeights( - strand_vocab_size, params['strand_hidden_size']) - - # Define a dense layer to linearly map the concatenated embeddings of - # all subreads at a given position to a smaller dimension - # (transformer_input_size) in order to keep the transformer layers small. - if self.params.condense_transformer_input: - logging.info('Condensing input.') - self.transformer_input_condenser = tf.keras.layers.Dense( - units=(params.transformer_input_size), - activation=None, - use_bias=False, - kernel_initializer='glorot_uniform', - bias_initializer='zeros') - - def encode(self, inputs: tf.Tensor, attention_bias: tf.Tensor, - training: bool) -> tf.Tensor: - """Runs the input through Encoder stack and problem-specific layers.""" - - # Input to embedding layer is [batch_size, length] and output will be - # [batch_size, length, embedding_size]. Embed each row of the input - # separately and then concatenate. - embedded_inputs = [] - base_indices, pw_indices, ip_indices, strand_indices, ccs_indices, sn_indices = data_providers.get_indices( - self.params['max_passes']) - if self.params.use_bases: - for i in range(*base_indices): - # Shape: [batch_size, length, per_base_hidden_size] - embedded = self.bases_embedding_layer( - tf.cast(inputs[:, :, i], tf.int32)) - embedded_inputs.append(embedded) - - - if self.params.use_pw: - for i in range(*pw_indices): - # Shape: [batch_size, length, pw_hidden_size] - embedded = self.pw_embedding_layer(tf.cast(inputs[:, :, i], tf.int32)) - embedded_inputs.append(embedded) - - if self.params.use_ip: - for i in range(*ip_indices): - # Shape: [batch_size, length, ip_hidden_size] - embedded = self.ip_embedding_layer(tf.cast(inputs[:, :, i], tf.int32)) - embedded_inputs.append(embedded) - - if self.params.use_strand: - for i in range(*strand_indices): - embedded = self.strand_embedding_layer( - tf.cast(inputs[:, :, i], tf.int32)) - embedded_inputs.append(embedded) - - if self.params.use_ccs: - for i in range(*ccs_indices): - embedded = self.bases_embedding_layer( - tf.cast(inputs[:, :, i], tf.int32)) - embedded_inputs.append(embedded) - - # TODO: experiment with computing a weighted average using snr as - # weights to aggregate subread-level embeddings (instead of concatenating). 
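
    # Hypothetical sketch of the TODO above (not existing code): aggregate the
    # per-subread embeddings with an SN-weighted average instead of
    # concatenating them. Names and shapes are assumptions for illustration:
    # `embeddings` is (batch, length, num_subreads, hidden), `sn` is
    # (batch, num_subreads).
    import tensorflow as tf

    def sn_weighted_average(embeddings: tf.Tensor, sn: tf.Tensor) -> tf.Tensor:
      weights = tf.nn.softmax(sn, axis=-1)             # (batch, num_subreads)
      weights = weights[:, tf.newaxis, :, tf.newaxis]  # broadcast over length/hidden
      # Weighted sum over the subread axis -> (batch, length, hidden).
      return tf.reduce_sum(embeddings * weights, axis=2)
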
- if self.params.use_sn: - # The last four elements in the last dimension in the inputs tensor - # correspond to the four signal-to-noise ratio scores for A, G, C, T. - for i in range(*sn_indices): - embedded = self.sn_embedding_layer(tf.cast(inputs[:, :, i], tf.int32)) - embedded_inputs.append(embedded) - - embedded_inputs = tf.concat(embedded_inputs, axis=-1) - embedded_inputs = tf.cast(embedded_inputs, self.params['dtype']) - - if self.params.condense_transformer_input: - # Condense the transformer input at each position to a smaller vector to - # reduce the transformer hidden size, since the transformer model size is - # quadratic in its hidden size. - # Shape: [batch_size, length, transformer_input_size] - transformer_input = self.transformer_input_condenser(embedded_inputs) - else: - transformer_input = embedded_inputs - - return super(EncoderOnlyLearnedValuesTransformer, - self).encode(transformer_input, attention_bias, training) diff --git a/deepconsensus/models/legacy_networks_test.py b/deepconsensus/models/legacy_networks_test.py deleted file mode 100644 index e92fd2c..0000000 --- a/deepconsensus/models/legacy_networks_test.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2021, Google Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of Google Inc. nor the names of its contributors -# may be used to endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
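
The learned-embeddings scheme removed above is kept, conceptually unchanged, in networks.py (which model_utils.get_model now uses for transformer_learn_values): each feature row is cast to int, embedded with its own layer, the embeddings are concatenated per position, and optionally condensed to transformer_input_size with a bias-free dense layer. A self-contained sketch with only two features and illustrative shapes; the real layers are EmbeddingSharedWeights with zero-id masking, with tf.keras.layers.Embedding standing in here:

    import tensorflow as tf

    batch, length = 2, 120
    bases = tf.random.uniform((batch, length), maxval=5, dtype=tf.int32)
    pw = tf.random.uniform((batch, length), maxval=10, dtype=tf.int32)

    base_emb = tf.keras.layers.Embedding(input_dim=5, output_dim=8)
    pw_emb = tf.keras.layers.Embedding(input_dim=10, output_dim=8)

    # Embed each feature row separately, then concatenate per position.
    embedded = tf.concat([base_emb(bases), pw_emb(pw)], axis=-1)    # (2, 120, 16)
    # Optionally condense to the transformer input size (280 in the configs).
    condense = tf.keras.layers.Dense(280, activation=None, use_bias=False)
    print(condense(embedded).shape)                                 # (2, 120, 280)
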
-"""Tests for deepconsensus.models.legacy_networks.""" - -import itertools - -from absl.testing import absltest -from absl.testing import parameterized - -import ml_collections -import numpy as np -import tensorflow as tf - -from deepconsensus.models import data_providers -from deepconsensus.models import model_configs -from deepconsensus.models import model_utils - - -def get_tf_example_rows(params: ml_collections.ConfigDict, - inference: bool) -> np.ndarray: - """Returns one example from the training dataset for given params.""" - dataset = data_providers.get_dataset( - file_pattern=params.train_path, - num_epochs=params.num_epochs, - batch_size=params.batch_size, - params=params, - inference=inference) - tf_example = next(dataset.as_numpy_iterator()) - return tf_example['rows'] - - -class ModelsTest(parameterized.TestCase): - - @parameterized.parameters( - itertools.product( - [True, False], - [ - 'fc+test', - 'transformer+test', - 'transformer_learn_values+test', - ], - [True, False])) - def test_outputs(self, training, config_name, use_predict): - """Checks that softmax distribution and final predictions are valid. - - This test is only checking the output format and does not train the model. - Args: - training: whether we are in training or eval/test mode. - config_name: config to test. - use_predict: whether to use model.predict or call model as a function. - """ - params = model_configs.get_config(config_name) - model_utils.modify_params(params) - model = model_utils.get_model(params) - inference = not training - rows = get_tf_example_rows(params, inference=inference) - if use_predict: - softmax_output = model.predict(rows) - else: - softmax_output = model(rows, training=training).numpy() - predictions = tf.argmax(softmax_output, -1) - - # First dimension will always be equal to batch_size because test config - # uses a batch size of 1. - self.assertEqual(softmax_output.shape, - (params.batch_size, params.max_length, params.num_classes)) - self.assertTrue( - np.allclose( - np.sum(softmax_output, axis=-1), - np.ones(shape=[params.batch_size, params.max_length]))) - self.assertEqual(predictions.shape, (params.batch_size, params.max_length)) - - @parameterized.parameters( - itertools.product( - [ - 'fc+test', - 'transformer+test', - 'transformer_learn_values+test', - ], - [True, False])) - def test_predict_and_model_fn_equal(self, config_name, inference): - """Checks that model.predict and calling model as a function are equal.""" - config = model_configs.get_config(config_name) - model_utils.modify_params(config) - model = model_utils.get_model(config) - rows = get_tf_example_rows(config, inference=inference) - softmax_output_predict = model.predict(rows) - softmax_output = model(rows, training=False).numpy() - self.assertTrue( - np.allclose(softmax_output_predict, softmax_output, rtol=1e-05)) - -if __name__ == '__main__': - absltest.main() diff --git a/deepconsensus/models/losses_and_metrics_test.py b/deepconsensus/models/losses_and_metrics_test.py index efe9868..a626bb7 100644 --- a/deepconsensus/models/losses_and_metrics_test.py +++ b/deepconsensus/models/losses_and_metrics_test.py @@ -614,7 +614,7 @@ def test_distillation_loss_fn(self, batch_size, window_length, temperature, distill_loss = distill_loss + kl_ij # Get the distillation loss over the whole window. 
distill_loss = distill_loss / window_length - self.assertAlmostEqual(distill_loss, expected_loss[example_ind]) + self.assertAlmostEqual(distill_loss, expected_loss[example_ind], places=6) if __name__ == '__main__': diff --git a/deepconsensus/models/model_configs.py b/deepconsensus/models/model_configs.py index 0b175d9..590d0c4 100644 --- a/deepconsensus/models/model_configs.py +++ b/deepconsensus/models/model_configs.py @@ -65,46 +65,11 @@ def _set_base_fc_hparams(params): params.buffer_size = 1000 -def _set_base_transformer_v2_hparams(params): - """Updates given config with base values for the Transformer model.""" - # Architecture - params.model_name = 'transformer_v2' - params.add_pos_encoding = True - # Num heads should be divisible by hidden size. This value should be tuned for - # the production setting. TODO: update this parameter after - # tuning. - params.num_heads = 2 - params.layer_norm = False - params.dtype = dc_constants.TF_DATA_TYPE - params.condense_transformer_input = False - params.transformer_model_size = 'base' - - params.num_channels = 1 - params.use_bases = True - params.use_pw = True - params.use_ip = True - params.use_ccs = True - params.use_strand = True - params.use_sn = True - params.per_base_hidden_size = 1 - params.pw_hidden_size = 1 - params.ip_hidden_size = 1 - params.sn_hidden_size = 1 - params.strand_hidden_size = 1 - - # Training - params.batch_size = 256 - params.num_epochs = 50 - params.learning_rate = 1e-4 - params.buffer_size = 1000 - - def _set_base_transformer_hparams(params): """Updates given config with base values for the Transformer model.""" # Architecture params.model_name = 'transformer' params.add_pos_encoding = True - params.use_relative_pos_enc = True # Num heads should be divisible by hidden size. This value should be tuned for # the production setting. TODO: update this parameter after # tuning. @@ -135,28 +100,11 @@ def _set_base_transformer_hparams(params): def _set_transformer_learned_embeddings_hparams(params): - """Updates given config with values for the learned embeddings transformer.""" - _set_base_transformer_hparams(params) - params.model_name = 'transformer_learn_values' - params.PW_MAX = dc_constants.PW_MAX - params.IP_MAX = dc_constants.IP_MAX - params.STRAND_MAX = dc_constants.STRAND_MAX - params.SN_MAX = dc_constants.SN_MAX - params.per_base_hidden_size = 8 - params.pw_hidden_size = 8 - params.ip_hidden_size = 8 - params.strand_hidden_size = 2 - params.sn_hidden_size = 8 - params.condense_transformer_input = True - params.transformer_input_size = 280 - - -def _set_transformer_learned_embeddings_v2_hparams(params): """Updates given config with values for the learned embeddings transformer.""" # TODO: As we migrate off the legacy code, we might need to # adjust the params below. For now just making a copy of the previous params. 
- _set_base_transformer_v2_hparams(params) - params.model_name = 'transformer_learn_values_v2' + _set_base_transformer_hparams(params) + params.model_name = 'transformer_learn_values' params.PW_MAX = dc_constants.PW_MAX params.IP_MAX = dc_constants.IP_MAX params.STRAND_MAX = dc_constants.STRAND_MAX @@ -170,10 +118,10 @@ def _set_transformer_learned_embeddings_v2_hparams(params): params.transformer_input_size = 280 -def _set_transformer_learned_embeddings_v2_distill_hparams(params): +def _set_transformer_learned_embeddings_distill_hparams(params): """Updates given config with values for the distilled transformer.""" - _set_transformer_learned_embeddings_v2_hparams(params) - params.model_name = 'transformer_learn_values_v2_distill' + _set_transformer_learned_embeddings_hparams(params) + params.model_name = 'transformer_learn_values_distill' # Student architecture parameters. params.num_hidden_layers = 4 @@ -239,14 +187,12 @@ def get_config(config_name: str) -> ml_collections.ConfigDict: Valid config names must consist of two parts: {model_name}+{dataset_name}. The "+" must be present as a separator between the two parts. For example, - transformer_learn_bases+ccs is a valid name. + transformer_learn_values+ccs is a valid name. Valid model names include: * fc - * transformer (TODO: legacy codebase) - * transformer_learn_values (TODO: legacy codebase) - * transformer_v2 - * transformer_learn_values_v2 + * transformer + * transformer_learn_values Valid dataset names include: * ecoli @@ -289,16 +235,12 @@ def get_config(config_name: str) -> ml_collections.ConfigDict: params.limit = -1 if model_config_name == 'fc': _set_base_fc_hparams(params) - elif model_config_name == 'transformer_v2': - _set_base_transformer_v2_hparams(params) elif model_config_name == 'transformer': _set_base_transformer_hparams(params) - elif model_config_name == 'transformer_learn_values_v2': - _set_transformer_learned_embeddings_v2_hparams(params) elif model_config_name == 'transformer_learn_values': _set_transformer_learned_embeddings_hparams(params) - elif model_config_name == 'transformer_learn_values_v2_distill': - _set_transformer_learned_embeddings_v2_distill_hparams(params) + elif model_config_name == 'transformer_learn_values_distill': + _set_transformer_learned_embeddings_distill_hparams(params) else: raise ValueError('Unknown model_config_name: %s' % model_config_name) diff --git a/deepconsensus/models/model_distillation.py b/deepconsensus/models/model_distillation.py index 2addcaa..afef76d 100644 --- a/deepconsensus/models/model_distillation.py +++ b/deepconsensus/models/model_distillation.py @@ -30,12 +30,12 @@ Distillation attempts to train a smaller student model that mimics the larger teacher model. -Currently only transformer_learn_values_v2_distill config is +Currently only transformer_learn_values_distill config is supported for model training. 
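
A minimal sketch of the per-window distillation term exercised by the losses_and_metrics_test change above: KL divergence between temperature-softened teacher and student distributions, accumulated per position and divided by the window length. Function and argument names here are illustrative, not the exact library code:

    import tensorflow as tf

    def window_distillation_loss(teacher_logits, student_logits, temperature=1.0):
      # Shapes assumed: (batch, window_length, num_classes).
      teacher_probs = tf.nn.softmax(teacher_logits / temperature, axis=-1)
      student_probs = tf.nn.softmax(student_logits / temperature, axis=-1)
      kl = tf.keras.losses.KLDivergence(reduction=tf.keras.losses.Reduction.NONE)
      per_position = kl(teacher_probs, student_probs)   # (batch, window_length)
      # Average over the window, matching `distill_loss / window_length` above.
      return tf.reduce_mean(per_position, axis=-1)      # (batch,)
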
Example usage: -CONFIG="//learning/genomics/deepconsensus/models/model_configs.py:transformer_learn_values_v2_distill+ccs" +CONFIG="//learning/genomics/deepconsensus/models/model_configs.py:transformer_learn_values_distill+ccs" TEACHER_MODEL_DIR="" OUT_DIR=/tmp diff --git a/deepconsensus/models/model_distillation_test.py b/deepconsensus/models/model_distillation_test.py index 5281800..1becdf2 100644 --- a/deepconsensus/models/model_distillation_test.py +++ b/deepconsensus/models/model_distillation_test.py @@ -41,7 +41,7 @@ class ModelTrainTest(parameterized.TestCase): - @parameterized.parameters(['transformer_learn_values_v2_distill+test']) + @parameterized.parameters(['transformer_learn_values_distill+test']) def test_train_e2e(self, config_name): """Tests that training completes and output files written.""" diff --git a/deepconsensus/models/model_inference_test.py b/deepconsensus/models/model_inference_test.py index d3e1530..149eeba 100644 --- a/deepconsensus/models/model_inference_test.py +++ b/deepconsensus/models/model_inference_test.py @@ -43,7 +43,7 @@ class ModelInferenceTest(absltest.TestCase): def test_inference_e2e(self): """Tests that inference finishes running and an output file is created.""" - config_name = 'transformer_learn_values_v2+test' + config_name = 'transformer_learn_values+test' out_dir = self.create_tempdir().full_path checkpoint_path = test_utils.deepconsensus_testdata('model/checkpoint-1') params = model_configs.get_config(config_name) diff --git a/deepconsensus/models/model_train_custom_loop.py b/deepconsensus/models/model_train_custom_loop.py index f388c10..ad02d3c 100644 --- a/deepconsensus/models/model_train_custom_loop.py +++ b/deepconsensus/models/model_train_custom_loop.py @@ -30,7 +30,7 @@ To use this binary for training a specific model, the corresponding config file should be specified as input. Example usage: -CONFIG="//learning/genomics/deepconsensus/models/model_configs.py:transformer_learn_values_v2+ccs" +CONFIG="//learning/genomics/deepconsensus/models/model_configs.py:transformer_learn_values+ccs" OUT_DIR=/tmp time blaze run -c opt \ diff --git a/deepconsensus/models/model_utils.py b/deepconsensus/models/model_utils.py index c26df06..2b8f142 100644 --- a/deepconsensus/models/model_utils.py +++ b/deepconsensus/models/model_utils.py @@ -31,18 +31,17 @@ import json import logging import os -from typing import List, Optional, Tuple, Any, Union, Dict +from typing import Any, Dict, List, Optional, Tuple, Union import ml_collections import numpy as np import tensorflow as tf from deepconsensus.models import data_providers -from deepconsensus.models import legacy_networks from deepconsensus.models import losses_and_metrics from deepconsensus.models import networks +from deepconsensus.models import transformer_basic_params from deepconsensus.utils import dc_constants -from official.nlp.transformer import misc def get_deepconsensus_loss( @@ -119,16 +118,8 @@ def get_model(params: ml_collections.ConfigDict) -> tf.keras.Model: if params.model_name == 'fc': model = networks.FullyConnectedNet(params) elif params.model_name == 'transformer': - model = legacy_networks.EncoderOnlyTransformer(params) - # I'm using "_v2" suffix for the new code migrated out of legacy. Feel free - # to suggest more informative names. 
- elif params.model_name == 'transformer_v2': model = networks.EncoderOnlyTransformer(params) - elif params.model_name == 'transformer_learn_values': - model = legacy_networks.EncoderOnlyLearnedValuesTransformer(params) - # I'm using "_v2" suffix for the new code migrated out of legacy. Feel free - # to suggest more informative names. - elif 'transformer_learn_values_v2' in params.model_name: + elif 'transformer_learn_values' in params.model_name: model = networks.EncoderOnlyLearnedValuesTransformer(params) else: raise ValueError('Unknown model name: %s' % params.model_name) @@ -228,8 +219,7 @@ def modify_params(params: ml_collections.ConfigDict, params.hidden_size += 1 # Set model-specific parameters - if (params.model_name == 'transformer' or - params.model_name == 'transformer_v2'): + if params.model_name == 'transformer': # Transformer code uses default_batch_size, whereas my code uses # batch_size, so make sure both are the same. params.default_batch_size = params.batch_size @@ -241,7 +231,7 @@ def modify_params(params: ml_collections.ConfigDict, logging.info('Setting hidden size to transformer_input_size.') params.hidden_size = params.transformer_input_size if 'transformer' in params.model_name: - transformer_params = misc.get_model_params( + transformer_params = get_transformer_model_params( params.transformer_model_size, num_gpus=num_gpus) # Only add hyperparameters that don't already exist. for param_name, param_value in transformer_params.items(): @@ -249,6 +239,25 @@ def modify_params(params: ml_collections.ConfigDict, params[param_name] = param_value +def get_transformer_model_params(param_set, num_gpus): + """Gets predefined transformer model params.""" + params_map = { + 'tiny': transformer_basic_params.TINY_PARAMS, + 'base': transformer_basic_params.BASE_PARAMS, + 'big': transformer_basic_params.BIG_PARAMS, + } + if num_gpus > 1: + if param_set == 'big': + return transformer_basic_params.BIG_MULTI_GPU_PARAMS.copy() + elif param_set == 'base': + return transformer_basic_params.BASE_MULTI_GPU_PARAMS.copy() + else: + raise ValueError('Not valid params: param_set={} num_gpus={}'.format( + param_set, num_gpus)) + + return params_map[param_set].copy() + + def run_inference_and_write_results(model: tf.keras.Model, out_dir: str, params: ml_collections.ConfigDict, diff --git a/deepconsensus/models/model_utils_test.py b/deepconsensus/models/model_utils_test.py index e477fc4..251b3e2 100644 --- a/deepconsensus/models/model_utils_test.py +++ b/deepconsensus/models/model_utils_test.py @@ -88,7 +88,7 @@ def test_output_dir_created(self): out_dir = f'/tmp/output_dir/{uuid.uuid1()}' self.assertFalse(tf.io.gfile.isdir(out_dir)) - params = model_configs.get_config('transformer_learn_values_v2+test') + params = model_configs.get_config('transformer_learn_values+test') model_utils.modify_params(params) model = model_utils.get_model(params) checkpoint_path = test_utils.deepconsensus_testdata('model/checkpoint-1') diff --git a/deepconsensus/models/networks_test.py b/deepconsensus/models/networks_test.py index 09e2bd3..c3cca97 100644 --- a/deepconsensus/models/networks_test.py +++ b/deepconsensus/models/networks_test.py @@ -62,7 +62,7 @@ class ModelsTest(parameterized.TestCase): [ 'fc+test', 'transformer+test', - 'transformer_learn_values_v2+test', + 'transformer_learn_values+test', ], [True, False])) def test_outputs(self, training, config_name, use_predict): @@ -100,7 +100,7 @@ def test_outputs(self, training, config_name, use_predict): [ 'fc+test', 'transformer+test', - 
'transformer_learn_values_v2+test', + 'transformer_learn_values+test', ], [True, False])) def test_predict_and_model_fn_equal(self, config_name, inference): diff --git a/deepconsensus/models/transformer_basic_params.py b/deepconsensus/models/transformer_basic_params.py new file mode 100644 index 0000000..a513a43 --- /dev/null +++ b/deepconsensus/models/transformer_basic_params.py @@ -0,0 +1,109 @@ +# Copyright (c) 2021, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of Google Inc. nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +"""Defines Transformer basic model parameters for each model size.""" + +import collections + + +BASE_PARAMS = collections.defaultdict( + lambda: None, # Set default value to None. + + # Input params + default_batch_size=2048, # Maximum number of tokens per batch of examples. + default_batch_size_tpu=32768, + max_length=256, # Maximum number of tokens per example. + + # Model params + initializer_gain=1.0, # Used in trainable variable initialization. + vocab_size=33708, # Number of tokens defined in the vocabulary file. + hidden_size=512, # Model dimension in the hidden layers. + num_hidden_layers=6, # Number of layers in the encoder and decoder stacks. + num_heads=8, # Number of heads to use in multi-headed attention. + filter_size=2048, # Inner layer dimension in the feedforward network. + + # Dropout values (only used when training) + layer_postprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + + # Training params + label_smoothing=0.1, + learning_rate=2.0, + learning_rate_decay_rate=1.0, + learning_rate_warmup_steps=16000, + + # Optimizer params + optimizer_adam_beta1=0.9, + optimizer_adam_beta2=0.997, + optimizer_adam_epsilon=1e-09, + + # Default prediction params + extra_decode_length=50, + beam_size=4, + alpha=0.6, # used to calculate length normalization in beam search + + # TPU specific parameters + use_tpu=False, + static_batch=False, + allow_ffn_pad=True, +) + +BIG_PARAMS = BASE_PARAMS.copy() +BIG_PARAMS.update( + default_batch_size=4096, + + # default batch size is smaller than for BASE_PARAMS due to memory limits. 
+ default_batch_size_tpu=16384, + + hidden_size=1024, + filter_size=4096, + num_heads=16, +) + +# Parameters for running the model in multi gpu. These should not change the +# params that modify the model shape (such as the hidden_size or num_heads). +BASE_MULTI_GPU_PARAMS = BASE_PARAMS.copy() +BASE_MULTI_GPU_PARAMS.update( + learning_rate_warmup_steps=8000 +) + +BIG_MULTI_GPU_PARAMS = BIG_PARAMS.copy() +BIG_MULTI_GPU_PARAMS.update( + layer_postprocess_dropout=0.3, + learning_rate_warmup_steps=8000 +) + +# Parameters for testing the model +TINY_PARAMS = BASE_PARAMS.copy() +TINY_PARAMS.update( + default_batch_size=1024, + default_batch_size_tpu=1024, + hidden_size=32, + num_heads=4, + filter_size=256, +) diff --git a/deepconsensus/testdata/README.md b/deepconsensus/testdata/README.md index 33d78bc..8045751 100644 --- a/deepconsensus/testdata/README.md +++ b/deepconsensus/testdata/README.md @@ -27,7 +27,7 @@ This command should take ~6 min to complete. Generated with: ```bash -MODEL=transformer_learn_values_v2 +MODEL=transformer_learn_values CONFIG="//learning/genomics/deepconsensus/models/model_configs.py:${MODEL}+test" TEMP_MODEL_DIR="/tmp/deepconsensus/model/$(TZ=US/Pacific date '+%Y%m%d%H%M%S')" MODEL_TRAIN_COMMAND="time blaze run -c opt \\ diff --git a/deepconsensus/testdata/model/params.json b/deepconsensus/testdata/model/params.json index 12bd971..2b4e1d3 100644 --- a/deepconsensus/testdata/model/params.json +++ b/deepconsensus/testdata/model/params.json @@ -35,8 +35,8 @@ "max_length": 120, "max_passes": 20, "model_checkpoint_freq": "epoch", - "model_config_name": "transformer_learn_values_v2", - "model_name": "transformer_learn_values_v2", + "model_config_name": "transformer_learn_values", + "model_name": "transformer_learn_values", "num_channels": 1, "num_classes": 5, "num_epochs": 50, diff --git a/install-gpu.sh b/install-gpu.sh index 2f1e84e..2d9acb8 100755 --- a/install-gpu.sh +++ b/install-gpu.sh @@ -57,4 +57,4 @@ echo "$(pip --version)" # Install python packages used by DeepConsensus. ################################################################################ python3 -m pip install --user -r requirements.txt -python3 -m pip install --user "tensorflow-gpu>=2.4.0,<=2.7.0" +python3 -m pip install --user "tensorflow-gpu>=2.9.0" diff --git a/install.sh b/install.sh index caec79f..717f209 100755 --- a/install.sh +++ b/install.sh @@ -57,4 +57,4 @@ echo "$(pip --version)" # Install python packages used by DeepConsensus. 
################################################################################ python3 -m pip install --user -r requirements.txt -python3 -m pip install --user "intel-tensorflow>=2.4.0,<=2.7.0" +python3 -m pip install --user "intel-tensorflow>=2.9.0" diff --git a/requirements.txt b/requirements.txt index 7980264..f52fe42 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,14 @@ numpy>=1.19 pandas>=1.1 -tf-models-official>=2.4.0,<=2.7.0 +tf-models-official>=2.9.0 ml_collections>=0.1.0 absl-py>=0.13.0 - +protobuf<3.20,>=3.9.2,<4,>=3.13 +flatbuffers<2,>=1.12 +keras<2.10.0,>=2.9.0rc0 +tensorflow-estimator<2.10.0,>=2.9.0rc0 +zipp>=3.1.0 +httplib2>=0.15.0 +httplib2<1dev,>=0.15.0 pysam==0.19.0 +testresources diff --git a/setup.py b/setup.py index 1b53029..6ea20dd 100644 --- a/setup.py +++ b/setup.py @@ -42,8 +42,8 @@ REQUIREMENTS = (here / 'requirements.txt').read_text().splitlines() EXTRA_REQUIREMENTS = { - 'cpu': ['intel-tensorflow>=2.4.0,<=2.7.0'], - 'gpu': ['tensorflow-gpu>=2.4.0,<=2.7.0'] + 'cpu': ['intel-tensorflow>=2.9.0'], + 'gpu': ['tensorflow-gpu>=2.9.0'] }
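
A minimal usage sketch of the get_transformer_model_params helper added in model_utils.py above, with values that follow from the tables in transformer_basic_params.py:

    from deepconsensus.models import model_utils

    # Single GPU: returns a copy of the named table ('tiny', 'base' or 'big').
    params = model_utils.get_transformer_model_params('base', num_gpus=1)
    print(params['hidden_size'], params['num_heads'])    # 512 8

    # More than one GPU: the *_MULTI_GPU variants are returned; only 'base'
    # and 'big' have them, anything else raises ValueError.
    multi = model_utils.get_transformer_model_params('big', num_gpus=2)
    print(multi['learning_rate_warmup_steps'])           # 8000
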