[go: nahoru, domu]

Skip to content

Commit

Permalink
Uncap base quality scores to 93 by default.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 508203931
  • Loading branch information
anastasiyabl authored and Copybara-Service committed Feb 8, 2023
1 parent 02274ba commit 8b87555
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 10 deletions.
26 changes: 21 additions & 5 deletions deepconsensus/inference/quick_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,12 @@ class DebugStage(enum.Enum):
'on the 3rd GPU, you can set this to 2. By default, if you have GPUs, the '
'lowest index one would be used.')

# The following parameters are for quality score calibration

# The following parameters are for base qualities and their calibration.
flags.DEFINE_integer(
'max_base_quality',
93,
'Base qualities will be capped at this quality value.',
)
flags.DEFINE_string(
'dc_calibration', None, 'If set to None, base quality values will be read '
'from model params.json if available. Set to "skip" to perform no quality '
Expand Down Expand Up @@ -201,6 +205,7 @@ class InferenceOptions:
window is below this value.
use_saved_model: True if the given checkpoint is a saved model, false if it
is a regular checkpoint.
max_base_quality: The maximum base quality value allowed.
dc_calibration_values: QualityCalibrationValues defining values to be used
for deepconsensus quality calibration.
ccs_calibration_values: QualityCalibrationValues defining values to be used
Expand All @@ -216,6 +221,7 @@ class InferenceOptions:
cpus: int
skip_windows_above: int
use_saved_model: bool
max_base_quality: int
dc_calibration_values: calibration_lib.QualityCalibrationValues
ccs_calibration_values: calibration_lib.QualityCalibrationValues

Expand Down Expand Up @@ -322,9 +328,12 @@ def run_model_on_examples(
if options.dc_calibration_values.enabled:
quality_scores = calibration_lib.calibrate_quality_scores(
quality_scores, options.dc_calibration_values)
quality_scores = np.minimum(quality_scores, dc_constants.MAX_QUAL)
# The maximum value for base quality scores is max_base_quality.
quality_scores = np.minimum(quality_scores, options.max_base_quality)
quality_scores = np.round(quality_scores, decimals=0)
quality_scores = quality_scores.astype(dtype=np.int32)
# The minimum value for base quality scores is 0.
quality_scores = np.maximum(quality_scores, 0)
for y_pred, qs, window_pos, molecule_name, ec, np_, rq, rg in zip(
y_preds, quality_scores, window_pos_arr, molecule_name_arr, ec_arr,
np_num_passes_arr, rq_arr, rg_arr):
Expand Down Expand Up @@ -498,7 +507,7 @@ def process_skipped_window(
if options.ccs_calibration_values.enabled:
ccs_quality_scores = calibration_lib.calibrate_quality_scores(
ccs_quality_scores, options.ccs_calibration_values)
ccs_quality_scores = np.minimum(ccs_quality_scores, dc_constants.MAX_QUAL)
ccs_quality_scores = np.minimum(ccs_quality_scores, options.max_base_quality)
ccs_quality_scores = ccs_quality_scores.astype(dtype=np.int32)
dc_output = stitch_utils.DCModelOutput(
window_pos=feature_dict['window_pos'],
Expand Down Expand Up @@ -707,6 +716,11 @@ def run() -> stitch_utils.OutcomeCounter:
params.use_ccs_bq,
)

if not FLAGS.max_base_quality:
raise ValueError(
'--max_base_quality should be set to a valid number. If unset bases'
' with error probability of 0, will result in invalid quality scores.'
)
# Attempt to read default calibration values from model params.json.
# If not found, set to 'skip'.
if FLAGS.dc_calibration is None:
Expand Down Expand Up @@ -735,9 +749,11 @@ def run() -> stitch_utils.OutcomeCounter:
cpus=FLAGS.cpus,
skip_windows_above=FLAGS.skip_windows_above,
use_saved_model=use_saved_model,
max_base_quality=FLAGS.max_base_quality,
dc_calibration_values=dc_calibration_values,
use_ccs_bq=params.use_ccs_bq,
ccs_calibration_values=ccs_calibration_values)
ccs_calibration_values=ccs_calibration_values,
)
outcome_counter = stitch_utils.OutcomeCounter()
stats_counter = collections.Counter()

Expand Down
14 changes: 10 additions & 4 deletions deepconsensus/postprocess/stitch_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,9 @@ def test_is_quality_above_threshold(self, min_quality, read_qualities,

class ConvertToFastqStrDoFnTest(absltest.TestCase):

def assert_correct_fastq_str(self, molecule_name, fasta_str):
def assert_correct_fastq_str(
self, molecule_name, fasta_str, max_base_quality
):
fasta_str_parts = fasta_str.split('\n')
contig_name = fasta_str_parts[0]
# Check that the contig name is formatted correctly.
Expand All @@ -179,7 +181,8 @@ def assert_correct_fastq_str(self, molecule_name, fasta_str):
# Not all values in this range are allowed, since we are binning, but we
# do not consider that for this test.
possible_quals = utils.quality_scores_to_string(
np.array(range(dc_constants.EMPTY_QUAL, dc_constants.MAX_QUAL + 1)))
np.array(range(dc_constants.EMPTY_QUAL, max_base_quality + 1))
)
self.assertContainsSubset(quality_string_line, possible_quals)
self.assertLen(quality_string_line, len(sequence_line))

Expand All @@ -192,9 +195,12 @@ def test_convert_to_fastq_str(self):
output = stitch_utils.format_as_fastq(
molecule_name=molecule_name,
sequence=sequence,
quality_string=quality_string)
quality_string=quality_string,
)

self.assert_correct_fastq_str(molecule_name=molecule_name, fasta_str=output)
self.assert_correct_fastq_str(
molecule_name=molecule_name, fasta_str=output, max_base_quality=93
)


if __name__ == '__main__':
Expand Down
1 change: 0 additions & 1 deletion deepconsensus/utils/dc_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ class Strand(int, enum.Enum):
'ccs_base_quality_scores', 'ec', 'np_num_passes', 'rq', 'rg'
]

MAX_QUAL = 40
EMPTY_QUAL = 0

# The name of the eval metric to optimize (choosing checkpoints and vizier).
Expand Down

0 comments on commit 8b87555

Please sign in to comment.