deepconsensus/deepconsensus/postprocess/stitch_utils.py at r1.2 · google/deepconsensus

History

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

#

# Redistribution and use in source and binary forms, with or without modification,

# are permitted provided that the following conditions are met:

#

# 1. Redistributions of source code must retain the above copyright notice, this

# list of conditions and the following disclaimer.

#

# 2. Redistributions in binary form must reproduce the above copyright notice,

# this list of conditions and the following disclaimer in the documentation

# and/or other materials provided with the distribution.

#

# 3. Neither the name of Google Inc. nor the names of its contributors

# may be used to endorse or promote products derived from this software without

# specific prior written permission.

#

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR

# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON

# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""Methods for DeepConsensus stitch-predictions step."""

import dataclasses

from typing import Iterable, Optional, Tuple

from absl import logging

import numpy as np

from deepconsensus.utils import dc_constants

from deepconsensus.utils import utils

@dataclasses.dataclass
class DCModelOutput:
  """Per-window record consumed by the stitching step.

  One instance describes a single window of one molecule. The stitching code
  in this module reads `window_pos`, `sequence` and `quality_string`; the
  remaining fields are carried along but not read by the functions in this
  module.
  """

  # Name of the molecule this window belongs to.
  molecule_name: str
  # Offset of this window within the molecule. `get_full_sequence` compares it
  # against a running offset that advances by `max_length` per window.
  window_pos: int
  # NOTE(review): `ec`, `np_num_passes`, `rq` and `rg` look like per-read
  # metadata tags (presumably PacBio BAM tags) -- confirm against the producer
  # of these records.
  ec: float
  np_num_passes: int
  rq: float
  rg: str
  # Predicted bases for this window; None until a prediction is attached.
  sequence: Optional[str] = None
  # Quality characters paired position-by-position with `sequence`.
  quality_string: Optional[str] = None

def get_full_sequence(
    deepconsensus_outputs: Iterable['DCModelOutput'],
    max_length: int,
    fill_n: bool = False,
) -> Tuple[Optional[str], str]:
  """Stitch together windows of predictions into a full sequence.

  Windows are consumed in iteration order and are expected to be sorted by
  `window_pos`. A running offset starts at 0 and advances by `max_length` for
  every window emitted; a window whose `window_pos` is greater than that offset
  means one or more windows are missing in front of it.

  Args:
    deepconsensus_outputs: Window predictions for one molecule, sorted by
      `window_pos`. Each item must expose `window_pos`, `sequence` and
      `quality_string`.
    max_length: Number of positions the running offset advances per window.
    fill_n: If True, every missing window is replaced by `max_length` 'N' bases
      with `dc_constants.EMPTY_QUAL` qualities. If False, the first missing
      window aborts stitching.

  Returns:
    A `(sequence, quality_string)` tuple. If a window is missing and `fill_n`
    is False, returns `(None, '')`. If there are no windows at all, returns
    `('', '')`.
  """
  full_sequence_parts = []
  quality_string_parts = []
  # The filler for a missing window is identical every time, so it is built
  # lazily on the first gap and reused instead of being rebuilt per gap.
  filler_sequence = None
  filler_quality_string = None
  start = 0
  # DeepConsensus outputs are expected to be sorted.
  for dc_output in deepconsensus_outputs:
    # Emit one filler window per missing window until the running offset
    # catches up with this window's position.
    while dc_output.window_pos > start:
      if not fill_n:
        return None, ''
      if filler_sequence is None:
        filler_sequence = 'N' * max_length
        filler_quality_string = utils.quality_scores_to_string(
            np.array([dc_constants.EMPTY_QUAL] * max_length)
        )
      full_sequence_parts.append(filler_sequence)
      quality_string_parts.append(filler_quality_string)
      start += max_length
    full_sequence_parts.append(dc_output.sequence)
    quality_string_parts.append(dc_output.quality_string)
    start += max_length
  return ''.join(full_sequence_parts), ''.join(quality_string_parts)

def remove_gaps(sequence: str, quality_string: str) -> Tuple[str, str]:
  """Removes gaps and corresponding quality score from outputs.

  Bases and quality characters are paired position-by-position with `zip`, so
  if the two inputs differ in length the extra tail of the longer one is
  ignored.

  Args:
    sequence: Stitched bases, possibly containing `dc_constants.GAP`.
    quality_string: Quality characters aligned with `sequence`.

  Returns:
    A `(sequence, quality_string)` tuple holding only the positions whose base
    is not a gap.
  """
  bases_to_remove = {dc_constants.GAP}
  # Filter once, then join: repeated `str +=` in a loop is quadratic in the
  # worst case, whereas `str.join` is linear.
  kept_pairs = [
      (base, quality)
      for base, quality in zip(sequence, quality_string)
      if base not in bases_to_remove
  ]
  final_sequence = ''.join(base for base, _ in kept_pairs)
  final_quality_string = ''.join(quality for _, quality in kept_pairs)
  # Sanity checks on the result, kept from the original implementation.
  assert len(final_sequence) == len(final_quality_string)
  assert dc_constants.GAP not in final_sequence
  return final_sequence, final_quality_string

def is_quality_above_threshold(quality_string: str, min_quality: int) -> bool:
  """Reports whether a read's average phred quality reaches `min_quality`.

  Args:
    quality_string: Quality characters of the read, decoded with
      `utils.quality_string_to_array`.
    min_quality: Inclusive lower bound for the rounded average phred score.

  Returns:
    True if the average phred score, rounded to 5 decimal places, is greater
    than or equal to `min_quality`.
  """
  quality_score_array = utils.quality_string_to_array(quality_string)
  # Round the phred score to ensure expected behavior. Without rounding, a
  # read with all base qualities equal to 10 will have an average phred of
  # 9.99999 due to python floating point precision. Such a read would get
  # filtered out if min_quality is 10.
  rounded_avg_phred = round(utils.avg_phred(quality_score_array), 5)
  # Log with %s rather than %d: %d truncates the float (9.99999 would be
  # logged as 9) and cannot format NaN.
  logging.vlog(3, 'Quality is %s', rounded_avg_phred)
  return rounded_avg_phred >= min_quality

def format_as_fastq(
    molecule_name: str, sequence: str, quality_string: str
) -> str:
  """Render one read as a four-line FASTQ record.

  Args:
    molecule_name: Read name; written after the leading '@'.
    sequence: Bases for the sequence line.
    quality_string: Quality characters for the final line.

  Returns:
    The record text: '@name', sequence, '+', qualities, each terminated by a
    newline.
  """
  record_lines = (
      f'@{molecule_name}\n',
      f'{sequence}\n',
      '+\n',
      f'{quality_string}\n',
  )
  return ''.join(record_lines)

@dataclasses.dataclass
class OutcomeCounter:
  """Tally of what happened to each read passed to `stitch_to_fastq`.

  Every counter starts at zero. `stitch_to_fastq` increments exactly one of
  them per call.
  """

  # Stitching produced no sequence (None or an empty string).
  empty_sequence: int = 0
  # Nothing was left after gap removal.
  only_gaps: int = 0
  # Average quality was below the requested minimum.
  failed_quality_filter: int = 0
  # Gap-free sequence was shorter than the requested minimum length.
  failed_length_filter: int = 0
  # Read passed every filter and was formatted as FASTQ.
  success: int = 0

def stitch_to_fastq(
    molecule_name: str,
    predictions: Iterable[DCModelOutput],
    max_length: int,
    min_quality: int,
    min_length: int,
    outcome_counter: OutcomeCounter,
) -> Optional[str]:
  """Build a FASTQ record for one molecule, or reject it.

  The windows are stitched with `get_full_sequence` (using its default
  `fill_n`, so a missing window yields no sequence), gaps are stripped, and the
  read is then checked against the quality and length thresholds, in that
  order. Exactly one field of `outcome_counter` is incremented per call.

  Args:
    molecule_name: Read name used in log messages and in the FASTQ header.
    predictions: Window predictions for this molecule, sorted by position.
    max_length: Window length forwarded to `get_full_sequence`.
    min_quality: Minimum average quality the read must reach.
    min_length: Minimum number of bases the gap-free read must have.
    outcome_counter: Mutated in place to record the outcome of this call.

  Returns:
    The FASTQ text for the read, or None if any filter rejected it.
  """
  stitched_bases, stitched_quals = get_full_sequence(
      deepconsensus_outputs=predictions, max_length=max_length
  )

  # Nothing came out of stitching (None or empty string): drop the read.
  if not stitched_bases:
    outcome_counter.empty_sequence += 1
    logging.vlog(
        1, 'Filtered out read that was empty after stitching: %s', molecule_name
    )
    return None

  ungapped_bases, ungapped_quals = remove_gaps(
      sequence=stitched_bases, quality_string=stitched_quals
  )

  # Every position was a gap: drop the read.
  if not ungapped_bases:
    outcome_counter.only_gaps += 1
    logging.vlog(
        1,
        'Filtered out read that contained only gaps and no bases: %s',
        molecule_name,
    )
    return None

  # The quality filter runs before the length filter.
  passes_quality = is_quality_above_threshold(
      quality_string=ungapped_quals, min_quality=min_quality
  )
  if not passes_quality:
    outcome_counter.failed_quality_filter += 1
    logging.vlog(
        1, 'Filtered out read below quality threshold: %s', molecule_name
    )
    return None

  is_too_short = len(ungapped_bases) < min_length
  if is_too_short:
    outcome_counter.failed_length_filter += 1
    logging.vlog(
        1, 'Filtered out read below length threshold: %s', molecule_name
    )
    return None

  # Format first, then count, so a formatting failure is not counted as a
  # success.
  fastq_record = format_as_fastq(
      molecule_name=molecule_name,
      sequence=ungapped_bases,
      quality_string=ungapped_quals,
  )
  outcome_counter.success += 1
  return fastq_record

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

stitch_utils.py

stitch_utils.py

Files

stitch_utils.py

Latest commit

History

stitch_utils.py

File metadata and controls