diff --git a/deepconsensus/cli.py b/deepconsensus/cli.py index 669dbd3..e535d89 100644 --- a/deepconsensus/cli.py +++ b/deepconsensus/cli.py @@ -35,6 +35,7 @@ preprocess: Convert aligned subreads to tf.Example format. run: Run DeepConsenseus beginning with aligned subreads. calibrate: Calculate base-quality calibration. + filter_reads: Filters reads from fastq or bam at a given q-value. """ import argparse @@ -46,7 +47,7 @@ from deepconsensus.utils import dc_constants -COMMANDS = ['preprocess', 'run', 'calibrate'] +COMMANDS = ['preprocess', 'run', 'calibrate', 'filter_reads'] def parse_flags(argv): @@ -105,6 +106,12 @@ def main(argset): calculate_baseq_calibration.register_required_flags() handle_help(passed, calculate_baseq_calibration) app.run(calculate_baseq_calibration.main, argv=passed) + elif args.command == 'filter_reads': + from deepconsensus.quality_calibration import filter_reads + + filter_reads.register_required_flags() + handle_help(passed, filter_reads) + app.run(filter_reads.main, argv=passed) def run(): diff --git a/deepconsensus/quality_calibration/filter_reads.py b/deepconsensus/quality_calibration/filter_reads.py new file mode 100644 index 0000000..90eef6e --- /dev/null +++ b/deepconsensus/quality_calibration/filter_reads.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python +# Copyright (c) 2021, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of Google Inc. nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +"""Filter reads based on the base qualities.""" + +import math +from absl import app +from absl import flags +from absl import logging +import pysam + +_INPUT_SEQ = flags.DEFINE_string( + 'input_seq', + short_name='i', + default=None, + help='Path to input fastq or bam file.', +) + +_OUTPUT_FASTQ = flags.DEFINE_string( + 'output_fastq', + short_name='o', + default=None, + help='Path to output fastq file.', +) + +_QUALITY_THRESHOLD = flags.DEFINE_integer( + 'quality_threshold', + short_name='q', + default=None, + help=( + 'A quality threshold value applied per read. Reads with read quality' + ' below this will be filtered out.' + ), +) + + +def register_required_flags(): + flags.mark_flags_as_required(['input_seq', 'output_fastq', + 'quality_threshold']) + + +def avg_phred(base_qualities: list[float]) -> float: + """Get the average phred quality given base qualities of a read. + + Args: + base_qualities: A list containing the base qualities of a read. + + Returns: + The average error rate of the read quality list. + """ + # Making sure we have the base quality. + if not base_qualities: + return 0 + return -10 * math.log10( + sum([10**(i / -10) for i in base_qualities]) / int(len(base_qualities))) + + +def filter_bam_or_fastq_by_quality( + input_seq: str, output_fastq: str, quality_threshold: int +): + """Filter reads of a fastq file based on base quality. + + Args: + input_seq: Path to a fastq or bam. + output_fastq: Path to a output fastq file. + quality_threshold: A threshold value. + """ + output_fastq = open(output_fastq, mode='w') + + total_reads = 0 + total_reads_above_q = 0 + if input_seq.endswith('.bam'): + input_file = pysam.AlignmentFile(input_seq, check_sq=False) + else: + input_file = pysam.FastxFile(input_seq) + with input_file as input_reads: + for read in input_reads: + total_reads += 1 + # Round the phred score to ensure expected behavior. Without rounding, a + # read with all base qualities equal to 10 will have an average phred of + # 9.99999 due to python floating point precision. Such a read would get + # filtered out if min_quality is 10. + if isinstance(read, pysam.AlignedSegment): + # Get avg phred from bam read. + phred = avg_phred(read.query_qualities) + else: + # Get avg phred from fastq quality scores. + phred = round(avg_phred(read.get_quality_array()), 5) + if phred >= quality_threshold: + total_reads_above_q += 1 + if isinstance(read, pysam.AlignedSegment): + record = '\n'.join( + ['@' + read.qname, read.query_sequence, '+', read.qual]) + '\n' + output_fastq.write(record) + else: + output_fastq.write(str(read) + '\n') + + output_fastq.close() + # Output from this script is piped directly to an output file. + logging.info('TOTAL READS IN INPUT: %d', total_reads) + logging.info('TOTAL READS IN OUTPUT: %d', total_reads_above_q) + logging.info( + 'TOTAL FILTERED READS: %d', + total_reads - total_reads_above_q, + ) + + +def main(unused_argv) -> None: + filter_bam_or_fastq_by_quality( + _INPUT_SEQ.value, + _OUTPUT_FASTQ.value, + _QUALITY_THRESHOLD.value, + ) + + +if __name__ == '__main__': + logging.use_python_logging() + register_required_flags() + app.run(main) diff --git a/deepconsensus/quality_calibration/filter_reads_test.py b/deepconsensus/quality_calibration/filter_reads_test.py new file mode 100644 index 0000000..7a77992 --- /dev/null +++ b/deepconsensus/quality_calibration/filter_reads_test.py @@ -0,0 +1,173 @@ +# Copyright (c) 2021, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of Google Inc. nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +"""Tests for filter_reads.""" + +import os + +from absl import flags +from absl.testing import absltest +from absl.testing import flagsaver +from absl.testing import parameterized +import pysam + +from deepconsensus.quality_calibration import filter_reads +from deepconsensus.utils import test_utils +from absl import app + + + +FLAGS = flags.FLAGS + + +class FilterReadsE2Etest(parameterized.TestCase): + """E2E tests for filter reads.""" + + def get_all_reads_from_file(self, filename): + """Returns all reads from a fastq file. + + Args: + filename: Path to input fastq file. + + Returns: + A list of reads in fastq file. + """ + input_file = pysam.FastxFile(filename) + with input_file as input_reads: + all_reads = list(input_reads) + return all_reads + + @parameterized.parameters( + dict( + expected_output_fastq=( + "filter_fastq/m64062_190806_063919_q0_chr20_100reads.q0.fq.gz" + ), + flag_values={ + "input_seq": ( + "filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz" + ), + "output_fastq": "output.q0.fq", + "quality_threshold": 0, + }, + ), + dict( + expected_output_fastq=( + "filter_fastq/m64062_190806_063919_q0_chr20_100reads.q10.fq.gz" + ), + flag_values={ + "input_seq": ( + "filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz" + ), + "output_fastq": "output.q10.fq", + "quality_threshold": 10, + }, + ), + dict( + expected_output_fastq=( + "filter_fastq/m64062_190806_063919_q0_chr20_100reads.q20.fq.gz" + ), + flag_values={ + "input_seq": ( + "filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz" + ), + "output_fastq": "output.q20.fq", + "quality_threshold": 20, + }, + ), + dict( + expected_output_fastq=( + "filter_fastq/m64062_190806_063919_q0_chr20_100reads.q30.fq.gz" + ), + flag_values={ + "input_seq": ( + "filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz" + ), + "output_fastq": "output.q30.fq", + "quality_threshold": 30, + }, + ), + dict( + expected_output_fastq=( + "filter_fastq/m64062_190806_063919_q0_chr20_100reads.q40.fq.gz" + ), + flag_values={ + "input_seq": ( + "filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz" + ), + "output_fastq": "output.q40.fq", + "quality_threshold": 40, + }, + ), + dict( + expected_output_fastq=( + "filter_fastq/m64062_190806_063919_q0_chr20_100reads.q50.fq.gz" + ), + flag_values={ + "input_seq": ( + "filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz" + ), + "output_fastq": "output.q50.fq", + "quality_threshold": 50, + }, + ), + dict( + expected_output_fastq=( + "filter_fastq/m64062_190806_063919-chr20.dc.small.q30.fq.gz" + ), + flag_values={ + "input_seq": ( + "filter_fastq/m64062_190806_063919-chr20.dc.small.bam" + ), + "output_fastq": "output.bam.q30.fq", + "quality_threshold": 30, + }, + ), + ) + @flagsaver.flagsaver + def test_filter_fastq_golden(self, expected_output_fastq, flag_values): + """Test filter_fastq method.""" + # Set flag values. + FLAGS.input_seq = test_utils.deepconsensus_testdata( + flag_values["input_seq"] + ) + # Create a tmp dir. + tmp_dir = self.create_tempdir() + output = os.path.join(tmp_dir, flag_values["output_fastq"]) + FLAGS.output_fastq = output + FLAGS.quality_threshold = flag_values["quality_threshold"] + filter_reads.main([]) + expected_fastq = test_utils.deepconsensus_testdata(expected_output_fastq) + expected_read_list = self.get_all_reads_from_file(expected_fastq) + read_list = self.get_all_reads_from_file(output) + for filtered_read, expected_read in zip(read_list, expected_read_list): + self.assertEqual(filtered_read.name, expected_read.name) + self.assertEqual(filtered_read.quality, expected_read.quality) + self.assertEqual(filtered_read.sequence, expected_read.sequence) + + +if __name__ == "__main__": + absltest.main() diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919-chr20.dc.small.bam b/deepconsensus/testdata/filter_fastq/m64062_190806_063919-chr20.dc.small.bam new file mode 100644 index 0000000..0ea2e14 Binary files /dev/null and b/deepconsensus/testdata/filter_fastq/m64062_190806_063919-chr20.dc.small.bam differ diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919-chr20.dc.small.q30.fq.gz b/deepconsensus/testdata/filter_fastq/m64062_190806_063919-chr20.dc.small.q30.fq.gz new file mode 100644 index 0000000..a04b201 Binary files /dev/null and b/deepconsensus/testdata/filter_fastq/m64062_190806_063919-chr20.dc.small.q30.fq.gz differ diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz new file mode 100644 index 0000000..90a4a41 Binary files /dev/null and b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz differ diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q0.fq.gz b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q0.fq.gz new file mode 100644 index 0000000..90a4a41 Binary files /dev/null and b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q0.fq.gz differ diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q10.fq.gz b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q10.fq.gz new file mode 100644 index 0000000..90a4a41 Binary files /dev/null and b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q10.fq.gz differ diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q20.fq.gz b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q20.fq.gz new file mode 100644 index 0000000..65d3663 Binary files /dev/null and b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q20.fq.gz differ diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q30.fq.gz b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q30.fq.gz new file mode 100644 index 0000000..8c052e0 Binary files /dev/null and b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q30.fq.gz differ diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q40.fq.gz b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q40.fq.gz new file mode 100644 index 0000000..e087a31 Binary files /dev/null and b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q40.fq.gz differ diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q50.fq.gz b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q50.fq.gz new file mode 100644 index 0000000..a776e8d Binary files /dev/null and b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q50.fq.gz differ