[go: nahoru, domu]

Skip to content

Commit

Permalink
Finalize CCS BQ Integration.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 506035527
  • Loading branch information
danielecook authored and Copybara-Service committed Jan 31, 2023
1 parent ad16825 commit 63df275
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 10 deletions.
29 changes: 20 additions & 9 deletions deepconsensus/inference/quick_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ class InferenceOptions:
min_quality: Quality threshold to filter final reads.
min_length: Length threshold to filter final reads.
batch_size: Number of examples passed through model at once.
use_ccs_bq: Use CCS Base Quality Scores as a feature.
cpus: Number of processes to use for multiprocessing. Must be positive (for
multiprocessing) or 0 (for serial execution).
skip_windows_above: Run the model only when the avg(ccs_base_qual) of the
Expand All @@ -211,6 +212,7 @@ class InferenceOptions:
min_quality: int
min_length: int
batch_size: int
use_ccs_bq: bool
cpus: int
skip_windows_above: int
use_saved_model: bool
Expand Down Expand Up @@ -386,7 +388,9 @@ def stream_bam(
"""

dc_config = pre_lib.DcConfig(
max_passes=options.max_passes, max_length=options.max_length)
max_passes=options.max_passes,
max_length=options.max_length,
use_ccs_bq=options.use_ccs_bq)

# Temporarily disable unused-variable.
# pylint: disable=unused-variable
Expand Down Expand Up @@ -421,6 +425,12 @@ def initialize_model(
if FLAGS.end_after_stage in [DebugStage.TF_EXAMPLES, DebugStage.DC_INPUT]:
return None, None

model_utils.modify_params(
params=params,
speedy=True,
max_length=options.max_length,
is_training=False)

logging.info('Loading %s', checkpoint_path)
if options.use_saved_model:
model = tf.saved_model.load(checkpoint_path)
Expand All @@ -440,11 +450,6 @@ def initialize_model(
checkpoint.restore(
checkpoint_path).expect_partial().assert_existing_objects_matched()

model_utils.modify_params(
params=params,
speedy=True,
max_length=options.max_length,
is_training=False)
logging.info('Finished initialize_model.')
return model, params

Expand Down Expand Up @@ -485,7 +490,9 @@ def process_skipped_window(
options: InferenceOptions) -> stitch_utils.DCModelOutput:
"""Process a window by simply adopting the CCS sequence and base qualities."""
rows = feature_dict['subreads']
ccs = rows[-5, :, 0]
_, _, _, _, ccs_index, _, _ = data_providers.get_indices(
options.max_passes, options.use_ccs_bq)
ccs = rows[ccs_index[0], :, 0]
ccs_seq = utils.encoded_sequence_to_string(ccs)
ccs_quality_scores = feature_dict['ccs_base_quality_scores']
if options.ccs_calibration_values.enabled:
Expand Down Expand Up @@ -694,8 +701,11 @@ def run() -> stitch_utils.OutcomeCounter:

# Load model parameters
params = model_utils.read_params_from_json(checkpoint_path=FLAGS.checkpoint)

dc_config = pre_lib.DcConfig(params.max_passes, params.max_length)
dc_config = pre_lib.DcConfig(
params.max_passes,
params.max_length,
params.use_ccs_bq,
)

# Attempt to read default calibration values from model params.json.
# If not found, set to 'skip'.
Expand Down Expand Up @@ -726,6 +736,7 @@ def run() -> stitch_utils.OutcomeCounter:
skip_windows_above=FLAGS.skip_windows_above,
use_saved_model=use_saved_model,
dc_calibration_values=dc_calibration_values,
use_ccs_bq=params.use_ccs_bq,
ccs_calibration_values=ccs_calibration_values)
outcome_counter = stitch_utils.OutcomeCounter()
stats_counter = collections.Counter()
Expand Down
27 changes: 27 additions & 0 deletions deepconsensus/models/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,31 @@ def _set_test_data_hparams(params):
params.fc_size = [4, 4]


def _set_test_bq_data_hparams(params):
  """Updates the given config with values for the CCS base-quality test dataset.

  Turns on `use_ccs_bq` and points every dataset path at the `tf_examples_bq`
  test data, resolved relative to the directory containing this module.

  Args:
    params: Config object that is modified in place. Its `model_name`
      attribute is read to decide whether `fc_size` should also be shrunk.
  """
  curr_dir = os.path.dirname(__file__)
  bq_examples_dir = '../testdata/human_1m/tf_examples_bq'

  params.use_ccs_bq = True
  params.train_path = [os.path.join(curr_dir, bq_examples_dir + '/train/*')]
  # Use same data for train/eval/hard eval because the eval test data is empty.
  params.eval_path = params.train_path
  params.test_path = params.train_path
  params.inference_path = os.path.join(
      curr_dir, bq_examples_dir + '/inference/*')
  params.n_examples_train = 253
  params.n_examples_eval = 253
  params.max_passes = 20

  # The test dataset uniquely sets these model-level parameters because the test
  # dataset is small and we want to keep model files small.
  params.batch_size = 1
  params.num_epochs = 1
  params.buffer_size = 10
  if params.model_name == 'fc':
    params.fc_size = [4, 4]


############### Core function for setting all config values ###############


Expand Down Expand Up @@ -338,6 +363,8 @@ def get_config(config_name: Optional[str] = None) -> ml_collections.ConfigDict:
_set_ecoli_data_hparams(params)
elif dataset_config_name == 'test':
_set_test_data_hparams(params)
elif dataset_config_name == 'test_bq':
_set_test_bq_data_hparams(params)
elif dataset_config_name == 'custom':
_set_custom_data_hparams(params)
else:
Expand Down
19 changes: 18 additions & 1 deletion deepconsensus/models/networks.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,19 +351,29 @@ def __init__(self,
vocab_size=dc_constants.SEQ_VOCAB_SIZE,
embedding_width=params['per_base_hidden_size'],
name='bases_embedding')

if params.use_pw:
pw_vocab_size = params.PW_MAX + 1
self.pw_embedding_layer = ModifiedOnDeviceEmbedding(
vocab_size=pw_vocab_size,
embedding_width=params['pw_hidden_size'],
name='pw_embedding')

if params.use_ip:
ip_vocab_size = params.IP_MAX + 1
self.ip_embedding_layer = ModifiedOnDeviceEmbedding(
vocab_size=ip_vocab_size,
embedding_width=params['ip_hidden_size'],
name='ip_embedding')

if params.use_ccs_bq:
# Values range from -1 to 93; So 95 distinct values.
ccs_bq_scores_vocab_size = params.CCS_BQ_MAX
self.ccs_base_quality_scores_embedding_layer = ModifiedOnDeviceEmbedding(
vocab_size=ccs_bq_scores_vocab_size,
embedding_width=params['ccs_bq_hidden_size'],
name='ccs_base_quality_scores_embedding')

if params.use_sn:
sn_vocab_size = params.SN_MAX + 1
self.sn_embedding_layer = ModifiedOnDeviceEmbedding(
Expand Down Expand Up @@ -404,7 +414,7 @@ def encode(self, inputs: tf.Tensor, attention_bias: tf.Tensor,
ip_indices,
strand_indices,
ccs_indices,
_,
ccs_bq_indices,
sn_indices,
) = data_providers.get_indices(
self.params.max_passes,
Expand All @@ -429,6 +439,13 @@ def encode(self, inputs: tf.Tensor, attention_bias: tf.Tensor,
embedded = self.ip_embedding_layer(tf.cast(inputs[:, :, i], tf.int32))
embedded_inputs.append(embedded)

if self.params.use_ccs_bq:
for i in range(*ccs_bq_indices):
# Add 1 to ccs base quality scores to shift gaps from -1 to 0.
embedded = self.ccs_base_quality_scores_embedding_layer(
tf.cast(inputs[:, :, i] + 1, tf.int32))
embedded_inputs.append(embedded)

if self.params.use_strand:
for i in range(*strand_indices):
embedded = self.strand_embedding_layer(
Expand Down
2 changes: 2 additions & 0 deletions deepconsensus/testdata/model_bq/checkpoint
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
model_checkpoint_path: "checkpoint-1"
all_model_checkpoint_paths: "checkpoint-1"
Binary file not shown.
5 changes: 5 additions & 0 deletions deepconsensus/testdata/model_bq/checkpoint_metrics.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
checkpoint_name group name value
/tmp/deepconsensus/model/20230130204511/checkpoint-1 eval eval/loss 168.41236877441406
/tmp/deepconsensus/model/20230130204511/checkpoint-1 eval eval/per_example_accuracy 0.0
/tmp/deepconsensus/model/20230130204511/checkpoint-1 eval eval/per_batch_alignment_identity 0.3844282031059265
/tmp/deepconsensus/model/20230130204511/checkpoint-1 eval eval/yield_over_ccs 0.0
86 changes: 86 additions & 0 deletions deepconsensus/testdata/model_bq/params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
{
"CCS_BQ_MAX": 95,
"IP_MAX": 255,
"PW_MAX": 255,
"SN_MAX": 500,
"STRAND_MAX": 2,
"add_pos_encoding": true,
"allow_ffn_pad": true,
"alpha": 0.6,
"attention_dropout": 0.1,
"attn_win_size": 12,
"band_width": null,
"batch_size": 1,
"beam_size": 4,
"beta_1": 0.9,
"beta_2": 0.999,
"buffer_size": 10,
"ccs_bq_hidden_size": 8,
"condense_transformer_input": true,
"conv_model": "resnet50",
"dataset_config_name": "test_bq",
"default_batch_size": 1,
"default_batch_size_tpu": 32768,
"del_cost": 10,
"dnabert_desired_hidden_size": 0,
"end_learning_rate": 2.86594e-05,
"epsilon": 1e-06,
"extra_decode_length": 50,
"filter_size": 2048,
"hidden_size": 280,
"initial_learning_rate": 0.0036246,
"initializer_gain": 1,
"ip_hidden_size": 8,
"label_smoothing": 0.1,
"layer_norm": false,
"layer_postprocess_dropout": 0.1,
"learning_rate": 2,
"learning_rate_decay_rate": 1,
"learning_rate_warmup_steps": 16000,
"limit": -1,
"loss_function": "alignment_loss",
"loss_reg": 0.1,
"max_length": 100,
"max_passes": 20,
"model_checkpoint_freq": "epoch",
"model_config_name": "transformer_learn_values",
"model_name": "transformer_learn_values",
"n_examples_eval": 253,
"n_examples_train": 253,
"num_channels": 1,
"num_epochs": 1,
"num_epochs_for_decay": 7,
"num_heads": 2,
"num_hidden_layers": 6,
"optimizer_adam_beta1": 0.9,
"optimizer_adam_beta2": 0.997,
"optimizer_adam_epsilon": 1e-09,
"per_base_hidden_size": 8,
"pw_hidden_size": 8,
"relu_dropout": 0.1,
"remove_label_gaps": false,
"rezero": true,
"seed": 1,
"sn_hidden_size": 8,
"static_batch": false,
"strand_hidden_size": 2,
"tensorboard_update_freq": "batch",
"total_rows": 86,
"tpu_scale_factor": 1,
"transformer_input_size": 280,
"transformer_model_size": "base",
"trial": 1,
"use_bases": true,
"use_ccs": true,
"use_ccs_bq": true,
"use_dnabert": false,
"use_ip": true,
"use_pw": true,
"use_sn": true,
"use_strand": true,
"use_tpu": false,
"vocab_size": 5,
"warmup_steps": 35536,
"weight_decay_rate": 0.0069868,
"dc_calibration": "0,1.197654,-0.99781"
}

0 comments on commit 63df275

Please sign in to comment.