Changes for Issues #442, #520, #528
Major change updating the AlleleParsers to extract AN, AC and HOM counts from population frequency sources.
Add new Gnomad3 and Gnomad4 AlleleParsers
Add new DirectoryArchive to enable parsing multiple VCF files in a single directory, e.g. for gnomad-v3 and gnomad-v4
Remove dbsnp, exac and esp data sources as these are now merged into gnomad (dbsnp was used as the source of the 1K genomes and TOPMed data).
Update hg19 to use gnomad-v2.1
Update hg38 to use gnomad-v4 (#528)
Update pathogenicity sources to use dbNSFP v4.5a and include AlphaMissense (#520). Remove M-CAP, MPC and PrimateAI
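
For context, extracting AN (total allele number), AC (alternate allele count) and HOM (homozygote count) means percentage frequencies can be derived from the raw counts rather than read from a pre-computed AF field. A minimal sketch of that arithmetic, assuming gnomAD-style INFO fields; the class and method names here are illustrative, not the actual AlleleParser API:

    // Illustrative only: derives a percentage frequency from gnomAD-style counts.
    public final class FrequencySketch {

        private FrequencySketch() {
        }

        // AC = alternate allele count, AN = total number of called alleles.
        // Returns the frequency as a percentage; 0 if no alleles were called.
        static float percentFrequency(int ac, int an) {
            return an == 0 ? 0f : 100f * ac / an;
        }

        public static void main(String[] args) {
            // e.g. a gnomAD site with AC=3, AN=152312 -> ~0.002%
            System.out.println(percentFrequency(3, 152312));
        }
    }

Keeping the raw AN/AC/HOM triple rather than a single pre-computed frequency retains the homozygote information and lets downstream code recompute frequencies consistently across sources.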
julesjacobsen committed Jan 25, 2024
1 parent b2c2ad8 commit 8099fa0
Showing 58 changed files with 2,478 additions and 886 deletions.
@@ -231,36 +231,38 @@ class CommandLineJobReaderTest {
.build()
))
.frequencySources(ImmutableSet.of(
FrequencySource.THOUSAND_GENOMES,
FrequencySource.TOPMED,
// commented-out values are legacy/founder populations and are no longer required;
// they are kept here for historical reference.
// FrequencySource.THOUSAND_GENOMES,
// FrequencySource.TOPMED,
FrequencySource.UK10K,

FrequencySource.ESP_AFRICAN_AMERICAN,
FrequencySource.ESP_EUROPEAN_AMERICAN,
FrequencySource.ESP_ALL,
// FrequencySource.ESP_AA,
// FrequencySource.ESP_EA,
// FrequencySource.ESP_ALL,

FrequencySource.EXAC_AFRICAN_INC_AFRICAN_AMERICAN,
FrequencySource.EXAC_AMERICAN,
FrequencySource.EXAC_SOUTH_ASIAN,
FrequencySource.EXAC_EAST_ASIAN,
FrequencySource.EXAC_FINNISH,
FrequencySource.EXAC_NON_FINNISH_EUROPEAN,
FrequencySource.EXAC_OTHER,
// FrequencySource.EXAC_AFRICAN_INC_AFRICAN_AMERICAN,
// FrequencySource.EXAC_AMERICAN,
// FrequencySource.EXAC_SOUTH_ASIAN,
// FrequencySource.EXAC_EAST_ASIAN,
// FrequencySource.EXAC_FINNISH,
// FrequencySource.EXAC_NON_FINNISH_EUROPEAN,
// FrequencySource.EXAC_OTHER,

FrequencySource.GNOMAD_E_AFR,
FrequencySource.GNOMAD_E_AMR,
FrequencySource.GNOMAD_E_EAS,
FrequencySource.GNOMAD_E_FIN,
// FrequencySource.GNOMAD_E_FIN,
FrequencySource.GNOMAD_E_NFE,
FrequencySource.GNOMAD_E_OTH,
// FrequencySource.GNOMAD_E_OTH,
FrequencySource.GNOMAD_E_SAS,

FrequencySource.GNOMAD_G_AFR,
FrequencySource.GNOMAD_G_AMR,
FrequencySource.GNOMAD_G_EAS,
FrequencySource.GNOMAD_G_FIN,
// FrequencySource.GNOMAD_G_FIN,
FrequencySource.GNOMAD_G_NFE,
FrequencySource.GNOMAD_G_OTH,
// FrequencySource.GNOMAD_G_OTH,
FrequencySource.GNOMAD_G_SAS
))
.pathogenicitySources(ImmutableSet.of(REVEL, MVP))
@@ -74,7 +74,7 @@ public class BuildCommand implements Callable<Integer> {
private Path buildClinVar;
@Option(names = "--transcripts", converter = TranscriptSourceConverter.class, split = ",", arity = "0..1", fallbackValue = "ensembl,refseq,ucsc", description = "List of transcript databases to build. If specified without parameter, will build all sources: ${FALLBACK-VALUE}")
private List<TranscriptSource> transcriptSources;
@Option(names = "--variants", split = ",", arity = "0..1", fallbackValue = "esp,exac,uk10k,topmed,dbsnp,gnomad-exome,gnomad-genome,dbnsfp", description = "List of variant data sources to build. If specified without parameter, will build all sources: ${FALLBACK-VALUE}")
@Option(names = "--variants", split = ",", arity = "0..1", fallbackValue = "esp,exac,uk10k,topmed,dbsnp,gnomad-exome,gnomad-genome,gnomad-mito,alfa,dbnsfp", description = "List of variant data sources to build. If specified without parameter, will build all sources: ${FALLBACK-VALUE}")
private List<String> variantSources;
@Option(names = "--genome", description = "Flag to trigger building of genome data.")
private boolean buildGenome;
@@ -43,6 +43,9 @@
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.sql.CallableStatement;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -104,16 +107,20 @@ public void run() {
// e.g. V1.3.1__insert_sv_path
// truncate and drop original resource tables
// TODO: ADD gnomAD pLOF, pLI, HI, triplosensitivity, genic intolerance, gene constraint scores.
// // TODO: do we want to do this? Don't we want each resource to handle its own sources, parsing and writing?
// // e.g. dbVar has 3-4 nr_ source files, clinVar has one file for both assemblies and only the correct
// // lines should be converted and written.


//build genome.h2.db
Path databasePath = outputPath.resolve(String.format("%s_genome", buildInfo.getBuildString()));
DataSource dataSource = createDataSource(databasePath);
logger.info("Created database: {}", databasePath);
migrateDatabase(dataSource);
// try compacting the database once everything is loaded to reduce size on disk
try(Connection connection = dataSource.getConnection();
CallableStatement statement = connection.prepareCall("SHUTDOWN COMPACT;")) {
logger.info("Shutting down and compacting genome database");
statement.execute();
} catch (SQLException e) {
logger.error("", e);
}
logger.info("Finished importing genome data");
}

@@ -165,7 +172,7 @@ private void downloadResource(URL source, Path destination) {
}

private DataSource createDataSource(Path databasePath) {
String initSql = "MODE=PostgreSQL;LOG=0;CACHE_SIZE=65536;LOCK_MODE=0;UNDO_LOG=0;MV_STORE=FALSE;";
String initSql = "MODE=PostgreSQL;LOG=0;CACHE_SIZE=65536;LOCK_MODE=0;UNDO_LOG=0;";
String url = String.format("jdbc:h2:file:%s;%s", databasePath.toAbsolutePath(), initSql);
return DataSourceBuilder.create()
.type(HikariDataSource.class)
@@ -74,9 +74,6 @@ public void run() {
throw new IllegalStateException("Error writing to MVStore " + fileName, e);
}

MVMap<AlleleKey, AlleleProperties> alleleMVMap = MvStoreUtil.openAlleleMVMap(mvStore);
logger.info("Written {} alleles to store", alleleMVMap.size());

// super-important step for producing as small a store as possible; size could double (or more) while compaction is in progress
logger.info("Compacting store...");
MVStoreTool.compact(fileName, true);
@@ -78,10 +78,11 @@ public Map<String, AlleleResource> hg19AlleleResources() {

alleleResources.put("gnomad-genome", gnomadGenomeAlleleResource());
alleleResources.put("gnomad-exome", gnomadExomeAlleleResource());
// TOPMed removed as this is now part of dbSNP
alleleResources.put("dbsnp", dbSnpAlleleResource());
alleleResources.put("gnomad-mito", gnomadMitoAlleleResource());
// TOPMed removed as this is now part of gnomAD v2.1
// dbSNP removed as this mostly adds a lot of empty data with only rsids
alleleResources.put("uk10k", uk10kAlleleResource());
alleleResources.put("exac", exacAlleleResource());
// ExAC removed as this is part of gnomad-exomes
alleleResources.put("esp", espAlleleResource());
alleleResources.put("dbnsfp", dbnsfpAlleleResource());
// ClinVar removed - now handled as a separate data source
@@ -132,6 +133,10 @@ public GnomadExomeAlleleResource gnomadExomeAlleleResource() {
return alleleResource(GnomadExomeAlleleResource.class, "hg19.gnomad-exome");
}

public Gnomad3MitoAlleleResource gnomadMitoAlleleResource() {
return alleleResource(Gnomad3MitoAlleleResource.class, "hg19.gnomad-mito");
}

private List<SvResource> hg19SvResources(Path genomeProcessPath) {
return List.of(
clinvarSvResource(genomeProcessPath),
@@ -79,13 +79,16 @@ public Path genomeProcessPath() {
public Map<String, AlleleResource> hg38AlleleResources() {
ImmutableMap.Builder<String, AlleleResource> alleleResources = new ImmutableMap.Builder<>();

// thousand genomes removed as this is part of gnomAD v2.1
alleleResources.put("gnomad-genome", gnomadGenomeAlleleResource());
alleleResources.put("gnomad-exome", gnomadExomeAlleleResource());
// TOPMed removed as this is now part of dbSNP
alleleResources.put("dbsnp", dbSnpAlleleResource());
alleleResources.put("gnomad-mito", gnomadMitoAlleleResource());
alleleResources.put("alfa", alfaAlleleResource());
// TOPMed removed as this is part of gnomAD v2.1
// dbSNP removed as this mostly adds a lot of empty data with only rsids
alleleResources.put("uk10k", uk10kAlleleResource());
alleleResources.put("exac", exacAlleleResource());
alleleResources.put("esp", espAlleleResource());
// ExAC removed as this is part of gnomad-exomes v2.1
// ESP removed as this is part of gnomad-exomes v4
alleleResources.put("dbnsfp", dbnsfpAlleleResource());
// ClinVar removed - now handled as a separate data source

@@ -127,21 +130,30 @@ public Uk10kAlleleResource uk10kAlleleResource() {
return alleleResource(Uk10kAlleleResource.class, "hg38.uk10k");
}

public GnomadGenomeAlleleResource gnomadGenomeAlleleResource() {
return alleleResource(GnomadGenomeAlleleResource.class, "hg38.gnomad-genome");
public Gnomad4GenomeAlleleResource gnomadGenomeAlleleResource() {
return alleleResource(Gnomad4GenomeAlleleResource.class, "hg38.gnomad-genome");
}

public GnomadExomeAlleleResource gnomadExomeAlleleResource() {
return alleleResource(GnomadExomeAlleleResource.class, "hg38.gnomad-exome");
public Gnomad4ExomeAlleleResource gnomadExomeAlleleResource() {
return alleleResource(Gnomad4ExomeAlleleResource.class, "hg38.gnomad-exome");
}

public Gnomad3MitoAlleleResource gnomadMitoAlleleResource() {
return alleleResource(Gnomad3MitoAlleleResource.class, "hg38.gnomad-mito");
}

public AlfaAlleleResource alfaAlleleResource() {
return alleleResource(AlfaAlleleResource.class, "hg38.alfa");
}

public List<SvResource> hg38SvResources(Path genomeProcessPath) {
// gnomAD hg38 SV data is part of dbVar; GoNL is hg19 only
return List.of(
clinvarSvResource(genomeProcessPath),
dbVarFrequencyResource(genomeProcessPath),
dgvSvResource(genomeProcessPath),
decipherSvResource(genomeProcessPath),
gonlSvFrequencyResource(genomeProcessPath),
// gonlSvFrequencyResource(genomeProcessPath),
gnomadSvFrequencyResource(genomeProcessPath)
);
}
@@ -28,14 +28,21 @@
import org.monarchinitiative.exomiser.core.proto.AlleleProto.ClinVar;
import org.monarchinitiative.exomiser.data.genome.model.Allele;
import org.monarchinitiative.exomiser.data.genome.model.AlleleProperty;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collection;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;

/**
* @author Jules Jacobsen <j.jacobsen@qmul.ac.uk>
*/
public class AlleleConverter {

private static final Logger logger = LoggerFactory.getLogger(AlleleConverter.class);

private AlleleConverter() {
//static utility class
}
@@ -50,24 +57,77 @@ public static AlleleKey toAlleleKey(Allele allele) {
}

public static AlleleProperties mergeProperties(AlleleProperties originalProperties, AlleleProperties properties) {
String updatedRsId = (originalProperties.getRsId()
.isEmpty()) ? properties.getRsId() : originalProperties.getRsId();
return AlleleProperties.newBuilder()
.mergeFrom(originalProperties)
.mergeFrom(properties)
//original rsid would have been overwritten by the new one - we don't necessarily want that, so re-set it now.
if (originalProperties.equals(properties)) {
return originalProperties;
}
logger.debug("Merging {} with {}", originalProperties, properties);
//original rsid would have been overwritten by the new one - we don't necessarily want that, so re-set it now.
String updatedRsId = originalProperties.getRsId().isEmpty() ? properties.getRsId() : originalProperties.getRsId();

// unfortunately since changing from a map to a list-based representation of the frequencies and pathogenicity scores
// this is more manual than simply calling .mergeFrom(originalProperties) / .mergeFrom(properties) as these will
// append the lists resulting in possible duplicates, hence we're merging them manually to avoid duplicates and
// overwrite any existing values with the newer version for that source.
Collection<AlleleProto.Frequency> mergedFrequencies = mergeFrequencies(originalProperties.getFrequenciesList(), properties.getFrequenciesList());
Collection<AlleleProto.PathogenicityScore> mergedPathScores = mergePathScores(originalProperties.getPathogenicityScoresList(), properties.getPathogenicityScoresList());

AlleleProperties.Builder mergedProperties = originalProperties.toBuilder();
if (!mergedProperties.hasClinVar() && properties.hasClinVar()) {
mergedProperties.setClinVar(properties.getClinVar());
}
return mergedProperties
.clearFrequencies()
.addAllFrequencies(mergedFrequencies)
.clearPathogenicityScores()
.addAllPathogenicityScores(mergedPathScores)
.setRsId(updatedRsId)
.build();
}
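
// Usage sketch (illustrative, not part of this commit): mergeProperties keeps the
// first-seen rsID and deduplicates frequencies/pathogenicity scores per source,
// with the newer record overwriting the older one for the same source, e.g.
//
//   AlleleProperties original = AlleleProperties.newBuilder().setRsId("rs12345").build();
//   AlleleProperties update = AlleleProperties.newBuilder().setRsId("rs99999").build();
//   AlleleProperties merged = AlleleConverter.mergeProperties(original, update);
//   // merged.getRsId() -> "rs12345"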

private static Collection<AlleleProto.PathogenicityScore> mergePathScores(List<AlleleProto.PathogenicityScore> originalPathScores, List<AlleleProto.PathogenicityScore> currentPathScores) {
if (originalPathScores.isEmpty()) {
return currentPathScores;
}
Map<AlleleProto.PathogenicitySource, AlleleProto.PathogenicityScore> mergedPaths = new EnumMap<>(AlleleProto.PathogenicitySource.class);
mergePaths(originalPathScores, mergedPaths);
mergePaths(currentPathScores, mergedPaths);
return mergedPaths.values();
}

private static void mergePaths(List<AlleleProto.PathogenicityScore> originalPathScores, Map<AlleleProto.PathogenicitySource, AlleleProto.PathogenicityScore> mergedPaths) {
for (int i = 0; i < originalPathScores.size(); i++) {
var pathScore = originalPathScores.get(i);
mergedPaths.put(pathScore.getPathogenicitySource(), pathScore);
}
}

private static Collection<AlleleProto.Frequency> mergeFrequencies(List<AlleleProto.Frequency> originalFreqs, List<AlleleProto.Frequency> currentFreqs) {
if (originalFreqs.isEmpty()) {
return currentFreqs;
}
Map<AlleleProto.FrequencySource, AlleleProto.Frequency> mergedFreqs = new EnumMap<>(AlleleProto.FrequencySource.class);
mergeFreqs(originalFreqs, mergedFreqs);
mergeFreqs(currentFreqs, mergedFreqs);
return mergedFreqs.values();
}

private static void mergeFreqs(List<AlleleProto.Frequency> originalFreqs, Map<AlleleProto.FrequencySource, AlleleProto.Frequency> mergedFreqs) {
for (int i = 0; i < originalFreqs.size(); i++) {
var freq = originalFreqs.get(i);
mergedFreqs.put(freq.getFrequencySource(), freq);
}
}

public static AlleleProperties toAlleleProperties(Allele allele) {
AlleleProperties.Builder builder = AlleleProperties.newBuilder();
builder.setRsId(allele.getRsId());
addAllelePropertyValues(builder, allele.getValues());
builder.addAllFrequencies(allele.getFrequencies());
builder.addAllPathogenicityScores(allele.getPathogenicityScores());
addClinVarData(builder, allele);
return builder.build();
}

@Deprecated(since = "14.0.0")
private static void addAllelePropertyValues(AlleleProperties.Builder builder, Map<AlleleProperty, Float> values) {
for (Map.Entry<AlleleProperty, Float> entry : values.entrySet()) {
builder.putProperties(entry.getKey().toString(), entry.getValue());
@@ -38,4 +38,5 @@ default void index(Resource<T> resource) {

void write(T type);

long count();
}
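
The new count() method gives writers a way to report how many records they have persisted, plausibly replacing the allele count previously read back from the MVStore map (see the deletion above). A hypothetical implementation sketch, assuming only the write/count signatures shown in this diff; the class name and persistence details are made up:

    // Hypothetical sketch: a writer that tracks its own record count.
    public class CountingWriter<T> {

        private long count = 0;

        public void write(T type) {
            // ... persist the record to the underlying store ...
            count++;
        }

        public long count() {
            return count;
        }
    }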