Merge pull request #14 from HadrienG/abundance

Abundance distributions
HadrienG · Jul 18, 2017 · a0aa58e · a0aa58e
2 parents 3c631c7 + daab13b
commit a0aa58e
Show file tree

Hide file tree

Showing 6 changed files with 208 additions and 20 deletions.
diff --git a/.gitignore b/.gitignore
@@ -10,6 +10,7 @@ __pycache__/
 
 # Distribution / packaging
 *.egg-info/
+dist/
 
 # Unit test / coverage reports
 .cache

diff --git a/iss/abundance.py b/iss/abundance.py
@@ -6,6 +6,9 @@
 import os
 import sys
 import logging
+import numpy as np
+
+from scipy import stats
 
 
 def parse_abundance_file(abundance_file):
@@ -45,6 +48,104 @@ def parse_abundance_file(abundance_file):
     return abundance_dic
 
 
+def uniform(record_list):
+    """Generate uniform abundance distribution from a number of records
+
+    Args:
+        record_list (list): a list of record.id
+
+    Returns:
+        list: a list of floats
+    """
+    abundance_dic = {}
+    n_records = len(record_list)
+    for record in record_list:
+        abundance_dic[record] = 1 / n_records
+
+    return abundance_dic
+
+
+def halfnormal(record_list):
+    """Generate scaled halfnormal abundance distribution from a number of
+        records
+
+    Args:
+        record_list (list): a list of record.id
+
+    Returns:
+        list: a list of floats
+    """
+    abundance_dic = {}
+    n_records = len(record_list)
+    dist = stats.halfnorm.rvs(loc=0.00, scale=1.00, size=n_records)
+    dist_scaled = dist / sum(dist)
+    for record, abundance in zip(record_list, dist_scaled):
+        abundance_dic[record] = abundance
+
+    return abundance_dic
+
+
+def exponential(record_list):
+    """Generate scaled exponential abundance distribution from a number of
+        records
+
+    Args:
+        record_list (list): a list of record.id
+
+    Returns:
+        list: a list of floats
+    """
+    abundance_dic = {}
+    n_records = len(record_list)
+    dist = np.random.exponential(size=n_records)
+    dist_scaled = dist / sum(dist)
+    for record, abundance in zip(record_list, dist_scaled):
+        abundance_dic[record] = abundance
+
+    return abundance_dic
+
+
+def lognormal(record_list):
+    """Generate scaled lognormal abundance distribution from a number of
+        records
+
+    Args:
+        record_list (list): a list of record.id
+
+    Returns:
+        list: a list of floats
+    """
+    abundance_dic = {}
+    n_records = len(record_list)
+    dist = np.random.lognormal(size=n_records)
+    dist_scaled = dist / sum(dist)
+    for record, abundance in zip(record_list, dist_scaled):
+        abundance_dic[record] = abundance
+
+    return abundance_dic
+
+
+def zero_inflated_lognormal(record_list):
+    """Generate scaled zero-inflated lognormal abundance distribution from a
+        number of records
+
+    Args:
+        record_list (list): a list of record.id
+
+    Returns:
+        list: a list of floats
+    """
+    abundance_dic = {}
+    n_records = len(record_list)
+    zero_inflated = stats.bernoulli.rvs(p=0.2, size=n_records)
+    dist = (1 - zero_inflated) * np.random.lognormal(size=n_records)
+    dist_scaled = dist / sum(dist)
+    for record, abundance in zip(record_list, dist_scaled):
+        abundance_dic[record] = abundance
+
+    return abundance_dic
+
+
 def to_coverage(total_n_reads, species_abundance, read_length, genome_size):
     """Calculate the coverage of a genome in a metagenome given its size and
     abundance

diff --git a/iss/app.py b/iss/app.py
@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 
 from iss import bam
+from iss import util
 from iss import abundance
 from iss import generator
 from Bio import SeqIO
@@ -51,25 +52,47 @@ def generate_reads(args):
         logger.error('Failed to import ErrorModel module: %s' % e)
         sys.exit(1)
 
-    # read the abundance file
-    abundance_dic = abundance.parse_abundance_file(args.abundance)
-
     try:  # try to read genomes and generate reads
         assert os.stat(args.genomes).st_size != 0
         f = open(args.genomes, 'r')
+        with f:  # count the number of records
+            record_list = util.count_records(f)
+
     except IOError as e:
         logger.error('Failed to open genome(s) file:%s' % e)
         sys.exit(1)
     except AssertionError as e:
         logger.error('Genome(s) file seems empty: %s' % args.genomes)
         sys.exit(1)
     else:
+        # read the abundance file
+        if args.abundance_file:
+            logger.info('Using abundance file:%s' % args.abundance_file)
+            abundance_dic = abundance.parse_abundance_file(args.abundance_file)
+        elif args.abundance == 'uniform':
+            logger.info('Using %s abundance distribution' % args.abundance)
+            abundance_dic = abundance.uniform(record_list)
+        elif args.abundance == 'halfnormal':
+            logger.info('Using %s abundance distribution' % args.abundance)
+            abundance_dic = abundance.halfnormal(record_list)
+        elif args.abundance == 'exponential':
+            logger.info('Using %s abundance distribution' % args.abundance)
+            abundance_dic = abundance.exponential(record_list)
+        elif args.abundance == 'lognormal':
+            logger.info('Using %s abundance distribution' % args.abundance)
+            abundance_dic = abundance.lognormal(record_list)
+        elif args.abundance == 'zero_inflated_lognormal':
+            logger.info('Using %s abundance distribution' % args.abundance)
+            abundance_dic = abundance.zero_inflated_lognormal(record_list)
+        else:
+            logger.error('Could not get abundance')
+            sys.exit(1)
+
+        f = open(args.genomes, 'r')  # re-opens the file
         with f:
             fasta_file = SeqIO.parse(f, 'fasta')
-            n_records = 0
             for record in fasta_file:  # generate set of reads for each record
                 try:
-                    n_records += 1
                     species_abundance = abundance_dic[record.id]
                 except KeyError as e:
                     logger.error(
@@ -92,16 +115,7 @@ def generate_reads(args):
                         )
 
                     generator.to_fastq(read_gen, args.output)
-
-            try:  # check if at least one record was in fasta file
-                assert n_records != 0
-            except AssertionError as e:
-                logger.error(
-                    'Failed to find records in genome(s) file:%s'
-                    % args.genomes)
-                sys.exit(1)
-            else:
-                logger.info('Read generation complete')
+        logger.info('Read generation complete')
 
 
 def model_from_bam(args):
@@ -170,15 +184,27 @@ def main():
     parser_gen.add_argument(
         '--genomes',
         '-g',
-        metavar='<fasta>',
+        metavar='<.fasta>',
         help='Input genome(s) from where the reads will originate (Required)',
         required=True
     )
     parser_gen.add_argument(
         '--abundance',
         '-a',
-        metavar='<txt>',
-        help='abundance file for coverage calculations (default: %(default)s)'
+        choices=['uniform', 'halfnormal',
+                 'exponential', 'lognormal', 'zero_inflated_lognormal'],
+        metavar='<str>',
+        default='lognormal',
+        help='abundance distribution (default: %(default)s). Can be uniform,\
+            halfnormal, exponential, lognormal or zero-inflated-lognormal.'
+    )
+    parser_gen.add_argument(
+        '--abundance_file',
+        '-b',
+        metavar='<.txt>',
+        help='abundance file for coverage calculations (default: %(default)s).\
+        If both --abundance and --abundance_file, the abundance file will be\
+        used instead of the distribution.'
     )
     parser_gen.add_argument(
         '--n_reads',
@@ -191,11 +217,11 @@ def main():
     parser_gen.add_argument(
         '--model',
         '-m',
-        metavar='[\'cdf\', \'kde\', \'basic\']',
+        metavar='<str>',
         choices=['cdf', 'kde', 'basic'],
         default='kde',
         help='Error model. If not specified, using kernel density estimation \
-        (default: %(default)s). Can be \'kde\', \'cdf\' or \'basic\''
+        (default: %(default)s). Can be kde, cdf or basic.'
     )
     parser_gen.add_argument(
         '--model_file',

diff --git a/iss/test/test_abundance.py b/iss/test/test_abundance.py
@@ -1,8 +1,11 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+from iss import util
 from iss import abundance
 
+import numpy as np
+
 
 def test_parsing():
     abundance_dic = abundance.parse_abundance_file('data/abundance.txt')
@@ -23,3 +26,25 @@ def test_cov_calc():
         4639221
         )
     assert round(coverage_ecoli, 3) == 25.866
+
+
+def test_distributions():
+    np.random.seed(42)
+    f = open('data/genomes.fasta', 'r')
+    with f:  # count the number of records
+        record_list = util.count_records(f)
+
+    uniform_dic = abundance.uniform(record_list)
+    halfnormal_dic = abundance.halfnormal(record_list)
+    exponential_dic = abundance.exponential(record_list)
+    lognormal_dic = abundance.lognormal(record_list)
+
+    np.random.seed(42)  # reset the seed to get 0s in zero_inflated_lognormal
+    zero_inflated_lognormal_dic = abundance.zero_inflated_lognormal(
+        record_list)
+    assert list(uniform_dic.values()) == [0.2] * 5
+    assert round(halfnormal_dic['genome_A'], 2) == 0.16
+    assert round(exponential_dic['genome_A'], 2) == 0.01
+    assert round(lognormal_dic['genome_T'], 2) == 0.19
+    assert zero_inflated_lognormal_dic['genome_T'] == 0.0
+    assert round(zero_inflated_lognormal_dic['genome_A'], 2) == 0.44
diff --git a/iss/test/test_util.py b/iss/test/test_util.py
@@ -21,3 +21,11 @@ def test_rev_comp():
     uppercase_seq = 'CCGATTAC'
     assert util.rev_comp(lowercase_seq) == 'atagcaat'
     assert util.rev_comp(uppercase_seq) == 'GTAATCGG'
+
+
+def test_count_records():
+    f = open('data/genomes.fasta', 'r')
+    with f:  # count the number of records
+        record_list = util.count_records(f)
+    assert record_list == [
+        'genome_A', 'genome_T', 'genome_GC', 'genome_ATCG', 'genome_TA']
diff --git a/iss/util.py b/iss/util.py
@@ -4,6 +4,9 @@
 from __future__ import division
 from builtins import dict
 
+from Bio import SeqIO
+
+import logging
 import numpy as np
 
 
@@ -57,3 +60,27 @@ def rev_comp(s):
     complement = "".join([bases[b] for b in sequence])
     reverse_complement = complement[::-1]
     return reverse_complement
+
+
+def count_records(fasta_file):
+    """Count the number of records in a fasta file and return a list of
+    recods id
+
+    Args:
+        fasta_file (string): the path to a fasta file
+
+    Returns:
+        list: a list of record ids
+    """
+    logger = logging.getLogger(__name__)
+    record_list = []
+    for record in SeqIO.parse(fasta_file, "fasta"):
+        record_list.append(record.id)
+    try:
+        assert len(record_list) != 0
+    except AssertionError as e:
+        logger.error(
+            'Failed to find records in genome(s) file:%s' % fasta_file)
+        sys.exit(1)
+    else:
+        return record_list