[go: nahoru, domu]

Skip to content

Commit

Permalink
Merge pull request #14 from HadrienG/abundance
Browse files Browse the repository at this point in the history
Abundance distributions
  • Loading branch information
HadrienG committed Jul 18, 2017
2 parents 3c631c7 + daab13b commit a0aa58e
Show file tree
Hide file tree
Showing 6 changed files with 208 additions and 20 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ __pycache__/

# Distribution / packaging
*.egg-info/
dist/

# Unit test / coverage reports
.cache
Expand Down
101 changes: 101 additions & 0 deletions iss/abundance.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
import os
import sys
import logging
import numpy as np

from scipy import stats


def parse_abundance_file(abundance_file):
Expand Down Expand Up @@ -45,6 +48,104 @@ def parse_abundance_file(abundance_file):
return abundance_dic


def uniform(record_list):
"""Generate uniform abundance distribution from a number of records
Args:
record_list (list): a list of record.id
Returns:
list: a list of floats
"""
abundance_dic = {}
n_records = len(record_list)
for record in record_list:
abundance_dic[record] = 1 / n_records

return abundance_dic


def halfnormal(record_list):
"""Generate scaled halfnormal abundance distribution from a number of
records
Args:
record_list (list): a list of record.id
Returns:
list: a list of floats
"""
abundance_dic = {}
n_records = len(record_list)
dist = stats.halfnorm.rvs(loc=0.00, scale=1.00, size=n_records)
dist_scaled = dist / sum(dist)
for record, abundance in zip(record_list, dist_scaled):
abundance_dic[record] = abundance

return abundance_dic


def exponential(record_list):
"""Generate scaled exponential abundance distribution from a number of
records
Args:
record_list (list): a list of record.id
Returns:
list: a list of floats
"""
abundance_dic = {}
n_records = len(record_list)
dist = np.random.exponential(size=n_records)
dist_scaled = dist / sum(dist)
for record, abundance in zip(record_list, dist_scaled):
abundance_dic[record] = abundance

return abundance_dic


def lognormal(record_list):
"""Generate scaled lognormal abundance distribution from a number of
records
Args:
record_list (list): a list of record.id
Returns:
list: a list of floats
"""
abundance_dic = {}
n_records = len(record_list)
dist = np.random.lognormal(size=n_records)
dist_scaled = dist / sum(dist)
for record, abundance in zip(record_list, dist_scaled):
abundance_dic[record] = abundance

return abundance_dic


def zero_inflated_lognormal(record_list):
"""Generate scaled zero-inflated lognormal abundance distribution from a
number of records
Args:
record_list (list): a list of record.id
Returns:
list: a list of floats
"""
abundance_dic = {}
n_records = len(record_list)
zero_inflated = stats.bernoulli.rvs(p=0.2, size=n_records)
dist = (1 - zero_inflated) * np.random.lognormal(size=n_records)
dist_scaled = dist / sum(dist)
for record, abundance in zip(record_list, dist_scaled):
abundance_dic[record] = abundance

return abundance_dic


def to_coverage(total_n_reads, species_abundance, read_length, genome_size):
"""Calculate the coverage of a genome in a metagenome given its size and
abundance
Expand Down
66 changes: 46 additions & 20 deletions iss/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# -*- coding: utf-8 -*-

from iss import bam
from iss import util
from iss import abundance
from iss import generator
from Bio import SeqIO
Expand Down Expand Up @@ -51,25 +52,47 @@ def generate_reads(args):
logger.error('Failed to import ErrorModel module: %s' % e)
sys.exit(1)

# read the abundance file
abundance_dic = abundance.parse_abundance_file(args.abundance)

try: # try to read genomes and generate reads
assert os.stat(args.genomes).st_size != 0
f = open(args.genomes, 'r')
with f: # count the number of records
record_list = util.count_records(f)

except IOError as e:
logger.error('Failed to open genome(s) file:%s' % e)
sys.exit(1)
except AssertionError as e:
logger.error('Genome(s) file seems empty: %s' % args.genomes)
sys.exit(1)
else:
# read the abundance file
if args.abundance_file:
logger.info('Using abundance file:%s' % args.abundance_file)
abundance_dic = abundance.parse_abundance_file(args.abundance_file)
elif args.abundance == 'uniform':
logger.info('Using %s abundance distribution' % args.abundance)
abundance_dic = abundance.uniform(record_list)
elif args.abundance == 'halfnormal':
logger.info('Using %s abundance distribution' % args.abundance)
abundance_dic = abundance.halfnormal(record_list)
elif args.abundance == 'exponential':
logger.info('Using %s abundance distribution' % args.abundance)
abundance_dic = abundance.exponential(record_list)
elif args.abundance == 'lognormal':
logger.info('Using %s abundance distribution' % args.abundance)
abundance_dic = abundance.lognormal(record_list)
elif args.abundance == 'zero_inflated_lognormal':
logger.info('Using %s abundance distribution' % args.abundance)
abundance_dic = abundance.zero_inflated_lognormal(record_list)
else:
logger.error('Could not get abundance')
sys.exit(1)

f = open(args.genomes, 'r') # re-opens the file
with f:
fasta_file = SeqIO.parse(f, 'fasta')
n_records = 0
for record in fasta_file: # generate set of reads for each record
try:
n_records += 1
species_abundance = abundance_dic[record.id]
except KeyError as e:
logger.error(
Expand All @@ -92,16 +115,7 @@ def generate_reads(args):
)

generator.to_fastq(read_gen, args.output)

try: # check if at least one record was in fasta file
assert n_records != 0
except AssertionError as e:
logger.error(
'Failed to find records in genome(s) file:%s'
% args.genomes)
sys.exit(1)
else:
logger.info('Read generation complete')
logger.info('Read generation complete')


def model_from_bam(args):
Expand Down Expand Up @@ -170,15 +184,27 @@ def main():
parser_gen.add_argument(
'--genomes',
'-g',
metavar='<fasta>',
metavar='<.fasta>',
help='Input genome(s) from where the reads will originate (Required)',
required=True
)
parser_gen.add_argument(
'--abundance',
'-a',
metavar='<txt>',
help='abundance file for coverage calculations (default: %(default)s)'
choices=['uniform', 'halfnormal',
'exponential', 'lognormal', 'zero_inflated_lognormal'],
metavar='<str>',
default='lognormal',
help='abundance distribution (default: %(default)s). Can be uniform,\
halfnormal, exponential, lognormal or zero-inflated-lognormal.'
)
parser_gen.add_argument(
'--abundance_file',
'-b',
metavar='<.txt>',
help='abundance file for coverage calculations (default: %(default)s).\
If both --abundance and --abundance_file, the abundance file will be\
used instead of the distribution.'
)
parser_gen.add_argument(
'--n_reads',
Expand All @@ -191,11 +217,11 @@ def main():
parser_gen.add_argument(
'--model',
'-m',
metavar='[\'cdf\', \'kde\', \'basic\']',
metavar='<str>',
choices=['cdf', 'kde', 'basic'],
default='kde',
help='Error model. If not specified, using kernel density estimation \
(default: %(default)s). Can be \'kde\', \'cdf\' or \'basic\''
(default: %(default)s). Can be kde, cdf or basic.'
)
parser_gen.add_argument(
'--model_file',
Expand Down
25 changes: 25 additions & 0 deletions iss/test/test_abundance.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from iss import util
from iss import abundance

import numpy as np


def test_parsing():
abundance_dic = abundance.parse_abundance_file('data/abundance.txt')
Expand All @@ -23,3 +26,25 @@ def test_cov_calc():
4639221
)
assert round(coverage_ecoli, 3) == 25.866


def test_distributions():
np.random.seed(42)
f = open('data/genomes.fasta', 'r')
with f: # count the number of records
record_list = util.count_records(f)

uniform_dic = abundance.uniform(record_list)
halfnormal_dic = abundance.halfnormal(record_list)
exponential_dic = abundance.exponential(record_list)
lognormal_dic = abundance.lognormal(record_list)

np.random.seed(42) # reset the seed to get 0s in zero_inflated_lognormal
zero_inflated_lognormal_dic = abundance.zero_inflated_lognormal(
record_list)
assert list(uniform_dic.values()) == [0.2] * 5
assert round(halfnormal_dic['genome_A'], 2) == 0.16
assert round(exponential_dic['genome_A'], 2) == 0.01
assert round(lognormal_dic['genome_T'], 2) == 0.19
assert zero_inflated_lognormal_dic['genome_T'] == 0.0
assert round(zero_inflated_lognormal_dic['genome_A'], 2) == 0.44
8 changes: 8 additions & 0 deletions iss/test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,11 @@ def test_rev_comp():
uppercase_seq = 'CCGATTAC'
assert util.rev_comp(lowercase_seq) == 'atagcaat'
assert util.rev_comp(uppercase_seq) == 'GTAATCGG'


def test_count_records():
f = open('data/genomes.fasta', 'r')
with f: # count the number of records
record_list = util.count_records(f)
assert record_list == [
'genome_A', 'genome_T', 'genome_GC', 'genome_ATCG', 'genome_TA']
27 changes: 27 additions & 0 deletions iss/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
from __future__ import division
from builtins import dict

from Bio import SeqIO

import logging
import numpy as np


Expand Down Expand Up @@ -57,3 +60,27 @@ def rev_comp(s):
complement = "".join([bases[b] for b in sequence])
reverse_complement = complement[::-1]
return reverse_complement


def count_records(fasta_file):
"""Count the number of records in a fasta file and return a list of
recods id
Args:
fasta_file (string): the path to a fasta file
Returns:
list: a list of record ids
"""
logger = logging.getLogger(__name__)
record_list = []
for record in SeqIO.parse(fasta_file, "fasta"):
record_list.append(record.id)
try:
assert len(record_list) != 0
except AssertionError as e:
logger.error(
'Failed to find records in genome(s) file:%s' % fasta_file)
sys.exit(1)
else:
return record_list

0 comments on commit a0aa58e

Please sign in to comment.