Add filter_reads capability to DeepConsensus CLI.

PiperOrigin-RevId: 512641889
google · Feb 27, 2023 · 06f08a7 · 06f08a7
1 parent 0750611
commit 06f08a7
Show file tree

Hide file tree

Showing 12 changed files with 325 additions and 1 deletion.
diff --git a/deepconsensus/cli.py b/deepconsensus/cli.py
@@ -35,6 +35,7 @@
   preprocess: Convert aligned subreads to tf.Example format.
   run: Run DeepConsenseus beginning with aligned subreads.
   calibrate: Calculate base-quality calibration.
+  filter_reads: Filters reads from fastq or bam at a given q-value.
 """
 
 import argparse
@@ -46,7 +47,7 @@
 
 from deepconsensus.utils import dc_constants
 
-COMMANDS = ['preprocess', 'run', 'calibrate']
+COMMANDS = ['preprocess', 'run', 'calibrate', 'filter_reads']
 
 
 def parse_flags(argv):
@@ -105,6 +106,12 @@ def main(argset):
     calculate_baseq_calibration.register_required_flags()
     handle_help(passed, calculate_baseq_calibration)
     app.run(calculate_baseq_calibration.main, argv=passed)
+  elif args.command == 'filter_reads':
+    from deepconsensus.quality_calibration import filter_reads
+
+    filter_reads.register_required_flags()
+    handle_help(passed, filter_reads)
+    app.run(filter_reads.main, argv=passed)
 
 
 def run():

diff --git a/deepconsensus/quality_calibration/filter_reads.py b/deepconsensus/quality_calibration/filter_reads.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python
+# Copyright (c) 2021, Google Inc.
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+# 
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+# 
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+# 
+# 3. Neither the name of Google Inc. nor the names of its contributors
+#    may be used to endorse or promote products derived from this software without
+#    specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""Filter reads based on the base qualities."""
+
+import math
+from absl import app
+from absl import flags
+from absl import logging
+import pysam
+
+_INPUT_SEQ = flags.DEFINE_string(
+    'input_seq',
+    short_name='i',
+    default=None,
+    help='Path to input fastq or bam file.',
+)
+
+_OUTPUT_FASTQ = flags.DEFINE_string(
+    'output_fastq',
+    short_name='o',
+    default=None,
+    help='Path to output fastq file.',
+)
+
+_QUALITY_THRESHOLD = flags.DEFINE_integer(
+    'quality_threshold',
+    short_name='q',
+    default=None,
+    help=(
+        'A quality threshold value applied per read. Reads with read quality'
+        ' below this will be filtered out.'
+    ),
+)
+
+
+def register_required_flags():
+  flags.mark_flags_as_required(['input_seq', 'output_fastq',
+                                'quality_threshold'])
+
+
+def avg_phred(base_qualities: list[float]) -> float:
+  """Get the average phred quality given base qualities of a read.
+
+  Args:
+     base_qualities: A list containing the base qualities of a read.
+
+  Returns:
+     The average error rate of the read quality list.
+  """
+  # Making sure we have the base quality.
+  if not base_qualities:
+    return 0
+  return -10 * math.log10(
+      sum([10**(i / -10) for i in base_qualities]) / int(len(base_qualities)))
+
+
+def filter_bam_or_fastq_by_quality(
+    input_seq: str, output_fastq: str, quality_threshold: int
+):
+  """Filter reads of a fastq file based on base quality.
+
+  Args:
+    input_seq: Path to a fastq or bam.
+    output_fastq: Path to a output fastq file.
+    quality_threshold: A threshold value.
+  """
+  output_fastq = open(output_fastq, mode='w')
+
+  total_reads = 0
+  total_reads_above_q = 0
+  if input_seq.endswith('.bam'):
+    input_file = pysam.AlignmentFile(input_seq, check_sq=False)
+  else:
+    input_file = pysam.FastxFile(input_seq)
+  with input_file as input_reads:
+    for read in input_reads:
+      total_reads += 1
+      # Round the phred score to ensure expected behavior. Without rounding, a
+      # read with all base qualities equal to 10 will have an average phred of
+      # 9.99999 due to python floating point precision. Such a read would get
+      # filtered out if min_quality is 10.
+      if isinstance(read, pysam.AlignedSegment):
+        # Get avg phred from bam read.
+        phred = avg_phred(read.query_qualities)
+      else:
+        # Get avg phred from fastq quality scores.
+        phred = round(avg_phred(read.get_quality_array()), 5)
+      if phred >= quality_threshold:
+        total_reads_above_q += 1
+        if isinstance(read, pysam.AlignedSegment):
+          record = '\n'.join(
+              ['@' + read.qname, read.query_sequence, '+', read.qual]) + '\n'
+          output_fastq.write(record)
+        else:
+          output_fastq.write(str(read) + '\n')
+
+  output_fastq.close()
+  # Output from this script is piped directly to an output file.
+  logging.info('TOTAL READS IN INPUT: %d', total_reads)
+  logging.info('TOTAL READS IN OUTPUT: %d', total_reads_above_q)
+  logging.info(
+      'TOTAL FILTERED READS: %d',
+      total_reads - total_reads_above_q,
+  )
+
+
+def main(unused_argv) -> None:
+  filter_bam_or_fastq_by_quality(
+      _INPUT_SEQ.value,
+      _OUTPUT_FASTQ.value,
+      _QUALITY_THRESHOLD.value,
+  )
+
+
+if __name__ == '__main__':
+  logging.use_python_logging()
+  register_required_flags()
+  app.run(main)
diff --git a/deepconsensus/quality_calibration/filter_reads_test.py b/deepconsensus/quality_calibration/filter_reads_test.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2021, Google Inc.
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+# 
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+# 
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+# 
+# 3. Neither the name of Google Inc. nor the names of its contributors
+#    may be used to endorse or promote products derived from this software without
+#    specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""Tests for filter_reads."""
+
+import os
+
+from absl import flags
+from absl.testing import absltest
+from absl.testing import flagsaver
+from absl.testing import parameterized
+import pysam
+
+from deepconsensus.quality_calibration import filter_reads
+from deepconsensus.utils import test_utils
+from absl import app
+
+
+
+FLAGS = flags.FLAGS
+
+
+class FilterReadsE2Etest(parameterized.TestCase):
+  """E2E tests for filter reads."""
+
+  def get_all_reads_from_file(self, filename):
+    """Returns all reads from a fastq file.
+
+    Args:
+      filename: Path to input fastq file.
+
+    Returns:
+        A list of reads in fastq file.
+    """
+    input_file = pysam.FastxFile(filename)
+    with input_file as input_reads:
+      all_reads = list(input_reads)
+    return all_reads
+
+  @parameterized.parameters(
+      dict(
+          expected_output_fastq=(
+              "filter_fastq/m64062_190806_063919_q0_chr20_100reads.q0.fq.gz"
+          ),
+          flag_values={
+              "input_seq": (
+                  "filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz"
+              ),
+              "output_fastq": "output.q0.fq",
+              "quality_threshold": 0,
+          },
+      ),
+      dict(
+          expected_output_fastq=(
+              "filter_fastq/m64062_190806_063919_q0_chr20_100reads.q10.fq.gz"
+          ),
+          flag_values={
+              "input_seq": (
+                  "filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz"
+              ),
+              "output_fastq": "output.q10.fq",
+              "quality_threshold": 10,
+          },
+      ),
+      dict(
+          expected_output_fastq=(
+              "filter_fastq/m64062_190806_063919_q0_chr20_100reads.q20.fq.gz"
+          ),
+          flag_values={
+              "input_seq": (
+                  "filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz"
+              ),
+              "output_fastq": "output.q20.fq",
+              "quality_threshold": 20,
+          },
+      ),
+      dict(
+          expected_output_fastq=(
+              "filter_fastq/m64062_190806_063919_q0_chr20_100reads.q30.fq.gz"
+          ),
+          flag_values={
+              "input_seq": (
+                  "filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz"
+              ),
+              "output_fastq": "output.q30.fq",
+              "quality_threshold": 30,
+          },
+      ),
+      dict(
+          expected_output_fastq=(
+              "filter_fastq/m64062_190806_063919_q0_chr20_100reads.q40.fq.gz"
+          ),
+          flag_values={
+              "input_seq": (
+                  "filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz"
+              ),
+              "output_fastq": "output.q40.fq",
+              "quality_threshold": 40,
+          },
+      ),
+      dict(
+          expected_output_fastq=(
+              "filter_fastq/m64062_190806_063919_q0_chr20_100reads.q50.fq.gz"
+          ),
+          flag_values={
+              "input_seq": (
+                  "filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz"
+              ),
+              "output_fastq": "output.q50.fq",
+              "quality_threshold": 50,
+          },
+      ),
+      dict(
+          expected_output_fastq=(
+              "filter_fastq/m64062_190806_063919-chr20.dc.small.q30.fq.gz"
+          ),
+          flag_values={
+              "input_seq": (
+                  "filter_fastq/m64062_190806_063919-chr20.dc.small.bam"
+              ),
+              "output_fastq": "output.bam.q30.fq",
+              "quality_threshold": 30,
+          },
+      ),
+  )
+  @flagsaver.flagsaver
+  def test_filter_fastq_golden(self, expected_output_fastq, flag_values):
+    """Test filter_fastq method."""
+    # Set flag values.
+    FLAGS.input_seq = test_utils.deepconsensus_testdata(
+        flag_values["input_seq"]
+    )
+    # Create a tmp dir.
+    tmp_dir = self.create_tempdir()
+    output = os.path.join(tmp_dir, flag_values["output_fastq"])
+    FLAGS.output_fastq = output
+    FLAGS.quality_threshold = flag_values["quality_threshold"]
+    filter_reads.main([])
+    expected_fastq = test_utils.deepconsensus_testdata(expected_output_fastq)
+    expected_read_list = self.get_all_reads_from_file(expected_fastq)
+    read_list = self.get_all_reads_from_file(output)
+    for filtered_read, expected_read in zip(read_list, expected_read_list):
+      self.assertEqual(filtered_read.name, expected_read.name)
+      self.assertEqual(filtered_read.quality, expected_read.quality)
+      self.assertEqual(filtered_read.sequence, expected_read.sequence)
+
+
+if __name__ == "__main__":
+  absltest.main()
diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919-chr20.dc.small.bam b/deepconsensus/testdata/filter_fastq/m64062_190806_063919-chr20.dc.small.bam
diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919-chr20.dc.small.q30.fq.gz b/deepconsensus/testdata/filter_fastq/m64062_190806_063919-chr20.dc.small.q30.fq.gz
diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.fq.gz
diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q0.fq.gz b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q0.fq.gz
diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q10.fq.gz b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q10.fq.gz
diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q20.fq.gz b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q20.fq.gz
diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q30.fq.gz b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q30.fq.gz
diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q40.fq.gz b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q40.fq.gz
diff --git a/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q50.fq.gz b/deepconsensus/testdata/filter_fastq/m64062_190806_063919_q0_chr20_100reads.q50.fq.gz