[go: nahoru, domu]

Skip to content

Commit

Permalink
Count trimmed insertions
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 473789630
  • Loading branch information
danielecook authored and Copybara-Service committed Sep 12, 2022
1 parent c0f8cc4 commit 3a8bd9a
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 17 deletions.
28 changes: 21 additions & 7 deletions deepconsensus/preprocess/pre_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import collections
from collections import abc
import dataclasses
import functools
import itertools
from typing import Any, Counter, Dict, List, Optional, Union

Expand Down Expand Up @@ -823,8 +824,10 @@ def read_truth_split(split_fname: str) -> Dict[str, str]:
return contig_split


def trim_insertions(read: pysam.AlignedSegment,
ins_trim: int) -> pysam.AlignedSegment:
def trim_insertions(
read: pysam.AlignedSegment,
ins_trim: int,
counter: Optional[Counter[str]] = None) -> pysam.AlignedSegment:
"""Remove insertions larger than max length.
This operation effectively
Expand All @@ -833,6 +836,7 @@ def trim_insertions(read: pysam.AlignedSegment,
Args:
read: pysam.AlignedSegment subread aligned to the ccs.
ins_trim: int maximum length of insertions.
counter: A counter object used to track information about processing data.
Returns:
pysam.AlignedSegment. Modified pysam.AlignedSegment subread.
Expand All @@ -858,12 +862,17 @@ def trim_insertions(read: pysam.AlignedSegment,
# Trim to zero, so this cigar operation is removed completely.
mask[seq_pos:seq_pos + op_len] = [False] * op_len
seq_pos += op_len
if counter is not None:
counter['zmw_trimmed_insertions'] += 1
counter['zmw_trimmed_insertions_bp'] += op_len
else:
trimmed_cigar.append((cigar_op, op_len))
if cigar_op not in [dc_constants.PYSAM_CDEL]:
trimmed_seq += read.query_sequence[seq_pos:seq_pos + op_len]
seq_pos += op_len

if counter is not None:
counter['zmw_total_bp'] += op_len
op_start += op_len

if pw_vals:
Expand All @@ -888,7 +897,8 @@ def trim_insertions(read: pysam.AlignedSegment,

def expand_clip_indent(read: pysam.AlignedSegment,
truth_range: Union[Dict[str, Any], None] = None,
ins_trim: int = 0) -> Read:
ins_trim: int = 0,
counter: Optional[Counter[str]] = None) -> Read:
"""Adds PAD tokens and clips reads.
For both subreads and label:
Expand All @@ -903,13 +913,14 @@ def expand_clip_indent(read: pysam.AlignedSegment,
truth_range: truth genome alignment coordinates. If supplied, it is
assumed this is the label alignment.
ins_trim: insertions in the read are trimmed if true.
counter: A counter object for tracking processing metrics.
Returns:
ExpandedRead
"""
# Trim insertions
if ins_trim > 0:
read = trim_insertions(read, ins_trim)
read = trim_insertions(read, ins_trim, counter)

# Extract read and reference indices.
aligned_pairs = read.get_aligned_pairs()
Expand Down Expand Up @@ -1060,9 +1071,12 @@ def create_proc_feeder(subreads_to_ccs: str,
def proc_feeder():
for read_set in subread_grouper:
main_counter['n_zmw_processed'] += 1
subreads = list(
map(expand_clip_indent, read_set, [None] * len(read_set),
[ins_trim] * len(read_set)))
expand_clip_indent_count = functools.partial(
expand_clip_indent,
truth_range=None,
ins_trim=ins_trim,
counter=main_counter)
subreads = list(map(expand_clip_indent_count, read_set))
ccs_seqname = '/'.join(subreads[0].name.split('/')[:2] + ['ccs'])
# Fetch ccs sequence and append to subread set.
while True:
Expand Down
34 changes: 24 additions & 10 deletions deepconsensus/preprocess/pre_lib_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,8 +424,9 @@ class TestTrimInsertions(parameterized.TestCase):
expected_ip=[1, 2, 3, 4, 11, 12, 13, 14],
# pw=10 is trimmed
expected_pw=[1, 2, 3, 4, 11, 12, 13, 14],
expected_strand=False # is_reversed
),
expected_strand=False, # is_reversed
expected_zmw_trimmed_insertions=1,
expected_zmw_trimmed_insertions_bp=6),
dict(
testcase_name='insertion_reversed',
segment_args={
Expand All @@ -443,8 +444,9 @@ class TestTrimInsertions(parameterized.TestCase):
expected_ip=[1, 2, 3, 4, 11],
# pw=5 is trimmed
expected_pw=[1, 2, 3, 4, 11],
expected_strand=True # is_reversed
),
expected_strand=True, # is_reversed
expected_zmw_trimmed_insertions=1,
expected_zmw_trimmed_insertions_bp=6),
dict(
testcase_name='insertion_no_trim',
segment_args={
Expand All @@ -462,8 +464,9 @@ class TestTrimInsertions(parameterized.TestCase):
expected_ip=[1] * 14,
expected_pw=[2] * 14,
expected_strand=False, # is_reversed
trim_ins=0,
),
ins_trim=0,
expected_zmw_trimmed_insertions=0,
expected_zmw_trimmed_insertions_bp=0),
dict(
testcase_name='deletion',
segment_args={
Expand All @@ -482,8 +485,9 @@ class TestTrimInsertions(parameterized.TestCase):
],
expected_ip=[1] * 8,
expected_pw=[2] * 8,
expected_strand=False # is_reversed
))
expected_strand=False, # is_reversed
expected_zmw_trimmed_insertions=0,
expected_zmw_trimmed_insertions_bp=0))
def test_trim_insertions(self,
segment_args,
expected_bases,
Expand All @@ -492,9 +496,13 @@ def test_trim_insertions(self,
expected_ip=None,
expected_pw=None,
expected_strand=None,
trim_ins=5):
expected_zmw_trimmed_insertions=None,
expected_zmw_trimmed_insertions_bp=None,
ins_trim=5):
segment = create_segment(**segment_args)
trimmed_segment = pre_lib.trim_insertions(segment, trim_ins)
counter = collections.Counter()
trimmed_segment = pre_lib.trim_insertions(
read=segment, ins_trim=ins_trim, counter=counter)
self.assertEqual(trimmed_segment.query_sequence, expected_bases)
aligned_pairs = trimmed_segment.get_aligned_pairs()
self.assertListEqual(trimmed_segment.cigartuples, expected_cigar)
Expand All @@ -507,6 +515,12 @@ def test_trim_insertions(self,
self.assertListEqual(list(trimmed_segment.get_tag('pw')), expected_pw)
if expected_strand:
self.assertEqual(expected_strand, trimmed_segment.is_reverse)
if expected_zmw_trimmed_insertions:
self.assertEqual(counter['zmw_trimmed_insertions'],
expected_zmw_trimmed_insertions)
if expected_zmw_trimmed_insertions_bp:
self.assertEqual(counter['zmw_trimmed_insertions_bp'],
expected_zmw_trimmed_insertions_bp)


class TestCcsRead(absltest.TestCase):
Expand Down

0 comments on commit 3a8bd9a

Please sign in to comment.