broadinstitute · samuelklee · Dec 14, 2023 · Dec 14, 2023 · Dec 14, 2023 · Dec 14, 2023
diff --git a/.github/workflows/gatk-tests.yml b/.github/workflows/gatk-tests.yml
@@ -291,7 +291,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        wdlTest: [ 'RUN_CNV_GERMLINE_COHORT_WDL', 'RUN_CNV_GERMLINE_CASE_WDL', 'RUN_CNV_SOMATIC_WDL', 'RUN_M2_WDL', 'RUN_CNN_WDL', 'RUN_VCF_SITE_LEVEL_FILTERING_WDL' ]
+        wdlTest: [ 'RUN_CNV_GERMLINE_COHORT_WDL', 'RUN_CNV_GERMLINE_CASE_WDL', 'RUN_CNV_SOMATIC_WDL', 'RUN_M2_WDL', 'RUN_VCF_SITE_LEVEL_FILTERING_WDL' ]
     continue-on-error: true
     name: WDL test ${{ matrix.wdlTest }} on cromwell
     steps:

diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-ARG BASE_DOCKER=broadinstitute/gatk:gatkbase-3.2.0
+ARG BASE_DOCKER=broadinstitute/gatk:gatkbase-3.3.0
 
 # stage 1 for constructing the GATK zip
 FROM ${BASE_DOCKER} AS gradleBuild
@@ -85,7 +85,7 @@ ENV CLASSPATH=/gatk/gatk.jar:$CLASSPATH PATH=$CONDA_PATH/envs/gatk/bin:$CONDA_PA
 
 # Start GATK Python environment
 
-RUN conda env create -n gatk -f /gatk/gatkcondaenv.yml && \
+RUN conda env create -vv -n gatk -f /gatk/gatkcondaenv.yml && \
     echo "source activate gatk" >> /gatk/gatkenv.rc && \
     echo "source /gatk/gatk-completion.sh" >> /gatk/gatkenv.rc && \
     conda clean -afy && \

diff --git a/build.gradle b/build.gradle
@@ -684,7 +684,7 @@ task localDevCondaEnv(type: Exec) {
     dependsOn 'condaEnvironmentDefinition'
     inputs.file("$buildDir/$pythonPackageArchiveName")
     workingDir "$buildDir"
-    commandLine "conda", "env", "create", "--force", "-f", gatkCondaYML
+    commandLine "conda", "env", "create", "--yes", "-f", gatkCondaYML
 }
 
 task javadocJar(type: Jar, dependsOn: javadoc) {

diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_case_scattered_workflow.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_case_scattered_workflow.json
@@ -17,8 +17,8 @@
   "CNVGermlineCaseScatteredWorkflow.gcnv_max_training_epochs": 1,
   "CNVGermlineCaseScatteredWorkflow.gcnv_min_training_epochs": 1,
   "CNVGermlineCaseScatteredWorkflow.gcnv_model_tars": [
-    "/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-0.tar.gz",
-    "/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-1.tar.gz"],
+    "/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-shard-0.tar.gz",
+    "/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-shard-1.tar.gz"],
   "CNVGermlineCaseScatteredWorkflow.gcnv_num_thermal_advi_iters": 1,
   "CNVGermlineCaseScatteredWorkflow.intervals": "/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/ice_targets_chr20xy.interval_list",
   "CNVGermlineCaseScatteredWorkflow.filtered_intervals": "/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/ice_targets_chr20xy.preprocessed.filtered.interval_list",

diff --git a/scripts/docker/gatkbase/Dockerfile b/scripts/docker/gatkbase/Dockerfile
@@ -30,6 +30,7 @@ RUN apt update && \
     git \
     gpg-agent \
     build-essential \
+    libblas-dev \
     openjdk-17-jdk \
     vim \
     software-properties-common && \

diff --git a/scripts/gatkcondaenv.yml.template b/scripts/gatkcondaenv.yml.template
@@ -15,53 +15,41 @@
 name: $condaEnvName
 channels:
 # if channels other than conda-forge are added and the channel order is changed (note that conda channel_priority is currently set to flexible),
-# verify that key dependencies are installed from the correct channel and compiled against MKL
+# verify that key dependencies are installed from the correct channel
 - conda-forge
-- defaults
+
 dependencies:
 
 # core python dependencies
-- conda-forge::python=3.6.10        # do not update
-- conda-forge::pip=21.3.1
-- conda-forge::mkl=2019.5           # MKL typically provides dramatic performance increases for theano, tensorflow, and other key dependencies
-- conda-forge::mkl-service=2.3.0
-- conda-forge::joblib=1.1.1         # must pin joblib - versions after 1.1.1 no longer support python 3.6
-- conda-forge::numpy=1.17.5         # do not update, this will break scipy=1.0.0
-                                    #   verify that numpy is compiled against MKL (e.g., by checking *_mkl_info using numpy.show_config())
-                                    #   and that it is used in tensorflow, theano, and other key dependencies
-- conda-forge::theano=1.0.4         # it is unlikely that new versions of theano will be released
-                                    #   verify that this is using numpy compiled against MKL (e.g., by the presence of -lmkl_rt in theano.config.blas.ldflags)
-- defaults::tensorflow=1.15.0       # update only if absolutely necessary, as this may cause conflicts with other core dependencies
-                                    #   verify that this is using numpy compiled against MKL (e.g., by checking tensorflow.pywrap_tensorflow.IsMklEnabled())
-- conda-forge::scipy=1.0.0          # do not update, this will break a scipy.misc.logsumexp import (deprecated in scipy=1.0.0) in pymc3=3.1
-- conda-forge::pymc3=3.1            # do not update, this will break gcnvkernel
-- conda-forge::h5py=2.10.0          # required by keras 2.2.4
-- conda-forge::keras=2.2.4          # updated from pip-installed 2.2.0, which caused various conflicts/clobbers of conda-installed packages
-                                    #   conda-installed 2.2.4 appears to be the most recent version with a consistent API and without conflicts/clobbers
-                                    #   if you wish to update, note that versions of conda-forge::keras after 2.2.5
-                                    #   undesirably set the environment variable KERAS_BACKEND = theano by default
-- defaults::intel-openmp=2019.4
-- conda-forge::scikit-learn=0.23.1
-- conda-forge::matplotlib=3.2.1
-- conda-forge::pandas=1.0.3
-- conda-forge::typing_extensions=4.1.1   # see https://github.com/broadinstitute/gatk/issues/7800 and linked PRs
-- conda-forge::dill=0.3.4                # used for pickling lambdas in TrainVariantAnnotationsModel
+- conda-forge::python=3.10.13         # do not update without good reason
+- conda-forge::pip=23.3.1
+- conda-forge::blas=1.0=mkl           # our official environment uses MKL versions of various packages; if other versions are desired, users should edit this YML accordingly
+- conda-forge::numpy=1.26.2
+- conda-forge::pymc=5.10.1
+- conda-forge::pytensor=2.18.3
+- conda-forge::scipy=1.11.4
+- conda-forge::h5py=3.10.0
+- conda-forge::pytorch=2.1.0=*mkl*100
+- conda-forge::scikit-learn=1.3.2
+- conda-forge::matplotlib=3.8.2
+- conda-forge::pandas=2.1.3
+- conda-forge::tqdm=4.66.1
+- conda-forge::dill=0.3.7             # used for pickling lambdas in TrainVariantAnnotationsModel
 
 # core R dependencies; these should only be used for plotting and do not take precedence over core python dependencies!
-- r-base=3.6.2
-- r-data.table=1.12.8
-- r-dplyr=0.8.5
-- r-getopt=1.20.3
-- r-ggplot2=3.3.0
-- r-gplots=3.0.3
-- r-gsalib=2.1
-- r-optparse=1.6.4
-- r-backports=1.1.10
+- r-base=4.3.1
+- r-data.table=1.14.8
+- r-dplyr=1.1.3
+- r-getopt=1.20.4
+- r-ggplot2=3.4.4
+- r-gplots=3.1.3
+- r-gsalib=2.2.1
+- r-optparse=1.7.3
+- r-backports=1.4.1
 
 # other python dependencies; these should be removed after functionality is moved into Java code
-- biopython=1.76
-- pyvcf=0.6.8
-- bioconda::pysam=0.15.3            # using older conda-installed versions may result in libcrypto / openssl bugs
+- bioconda::pysam=0.22.0
+- conda-forge::pyvcf=0.6.8
 
 # pip installs should be avoided, as pip may not respect the dependencies found by the conda solver
 - pip:

diff --git a/...in/java/org/broadinstitute/hellbender/tools/copynumber/DetermineGermlineContigPloidy.java b/...in/java/org/broadinstitute/hellbender/tools/copynumber/DetermineGermlineContigPloidy.java
@@ -67,12 +67,12 @@
  * <p>OpenMP and MKL parallelism can be controlled by setting the <code>OMP_NUM_THREADS</code> and <code>MKL_NUM_THREADS</code>
  * environment variables, respectively.</p>
  *
- * <p>Advanced users may wish to set the <code>THEANO_FLAGS</code> environment variable to override the GATK theano
+ * <p>Advanced users may wish to set the <code>PYTENSOR_FLAGS</code> environment variable to override the GATK PyTensor
  * configuration. For example, by running
- * <code>THEANO_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk DetermineGermlineContigPloidy ...</code>, users can specify
- * the theano compilation directory (which is set to <code>$HOME/.theano</code> by default).  See theano documentation
- * at <a href="https://theano-pymc.readthedocs.io/en/latest/library/config.html">
- *     https://theano-pymc.readthedocs.io/en/latest/library/config.html</a>.
+ * <code>PYTENSOR_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk DetermineGermlineContigPloidy ...</code>, users can specify
+ * the PyTensor compilation directory (which is set to <code>$HOME/.pytensor</code> by default).  See PyTensor documentation
+ * at <a href="https://pytensor.readthedocs.io/en/latest/library/config.html">
+ *     https://pytensor.readthedocs.io/en/latest/library/config.html</a>.
  * </p>
  *
  * <h3>Tool run modes</h3>

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/GermlineCNVCaller.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/GermlineCNVCaller.java
@@ -92,12 +92,12 @@
  * <p>OpenMP and MKL parallelism can be controlled by setting the <code>OMP_NUM_THREADS</code> and <code>MKL_NUM_THREADS</code>
  * environment variables, respectively.</p>
  *
- * <p>Advanced users may wish to set the <code>THEANO_FLAGS</code> environment variable to override the GATK theano
+ * <p>Advanced users may wish to set the <code>PYTENSOR_FLAGS</code> environment variable to override the GATK PyTensor
  * configuration. For example, by running
- * <code>THEANO_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk GermlineCNVCaller ...</code>, users can specify
- * the theano compilation directory (which is set to <code>$HOME/.theano</code> by default).  See theano documentation
- * at <a href="https://theano-pymc.readthedocs.io/en/latest/library/config.html">
- *     https://theano-pymc.readthedocs.io/en/latest/library/config.html</a>.
+ * <code>PYTENSOR_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk GermlineCNVCaller ...</code>, users can specify
+ * the PyTensor compilation directory (which is set to <code>$HOME/.pytensor</code> by default).  See PyTensor documentation
+ * at <a href="https://pytensor.readthedocs.io/en/latest/library/config.html">
+ *     https://pytensor.readthedocs.io/en/latest/library/config.html</a>.
  * </p>
  *
  * <h3>Resource usage</h3>

diff --git a/...main/java/org/broadinstitute/hellbender/tools/copynumber/PostprocessGermlineCNVCalls.java b/...main/java/org/broadinstitute/hellbender/tools/copynumber/PostprocessGermlineCNVCalls.java
@@ -89,12 +89,12 @@
  * the python environment is already set up. Otherwise, the environment must be created and activated as described in the
  * main GATK README.md file.</p>
  *
- * <p>Advanced users may wish to set the <code>THEANO_FLAGS</code> environment variable to override the GATK theano
+ * <p>Advanced users may wish to set the <code>PYTENSOR_FLAGS</code> environment variable to override the GATK PyTensor
  * configuration. For example, by running
- * <code>THEANO_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk PostprocessGermlineCNVCalls ...</code>, users can specify
- * the theano compilation directory (which is set to <code>$HOME/.theano</code> by default).  See theano documentation
- * at <a href="https://theano-pymc.readthedocs.io/en/latest/library/config.html">
- *     https://theano-pymc.readthedocs.io/en/latest/library/config.html</a>.
+ * <code>PYTENSOR_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk PostprocessGermlineCNVCalls ...</code>, users can specify
+ * the PyTensor compilation directory (which is set to <code>$HOME/.pytensor</code> by default).  See PyTensor documentation
+ * at <a href="https://pytensor.readthedocs.io/en/latest/library/config.html">
+ *     https://pytensor.readthedocs.io/en/latest/library/config.html</a>.
  * </p>
  *
  * <h3>Required inputs:</h3>

diff --git a/.../java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatios.java b/.../java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatios.java
@@ -204,8 +204,7 @@ private void writeDenoisingPlots(final String sampleName,
         //this runs the R statement "source("CNVPlottingLibrary.R")" before the main script runs
         executor.addScript(new Resource(PlottingUtils.CNV_PLOTTING_R_LIBRARY, PlotDenoisedCopyRatios.class));
         executor.addScript(new Resource(PLOT_DENOISED_COPY_RATIOS_R_SCRIPT, PlotDenoisedCopyRatios.class));
-        //--args is needed for Rscript to recognize other arguments properly
-        executor.addArgs("--args",
+        executor.addArgs(
                 "--sample_name=" + sampleName,
                 "--standardized_copy_ratios_file=" + CopyNumberArgumentValidationUtils.getCanonicalPath(inputStandardizedCopyRatiosFile),
                 "--denoised_copy_ratios_file=" + CopyNumberArgumentValidationUtils.getCanonicalPath(inputDenoisedCopyRatiosFile),

diff --git a/...ain/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegments.java b/...ain/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegments.java
@@ -285,8 +285,7 @@ private void writeModeledSegmentsPlot(final String sampleName,
         //this runs the R statement "source("CNVPlottingLibrary.R")" before the main script runs
         executor.addScript(new Resource(PlottingUtils.CNV_PLOTTING_R_LIBRARY, PlotModeledSegments.class));
         executor.addScript(new Resource(PLOT_MODELED_SEGMENTS_R_SCRIPT, PlotModeledSegments.class));
-        //--args is needed for Rscript to recognize other arguments properly
-        executor.addArgs("--args",
+        executor.addArgs(
                 "--sample_name=" + sampleName,
                 "--denoised_copy_ratios_file=" + (inputDenoisedCopyRatiosFile == null ? null : CopyNumberArgumentValidationUtils.getCanonicalPath(inputDenoisedCopyRatiosFile)),
                 "--allelic_counts_file=" + (inputAllelicCountsFile == null ? null : CopyNumberArgumentValidationUtils.getCanonicalPath(inputAllelicCountsFile)),

diff --git a/src/main/python/org/broadinstitute/hellbender/README.md b/src/main/python/org/broadinstitute/hellbender/README.md
@@ -4,3 +4,9 @@ can be installed as a standalone package, a corresponding `setup_<PACKAGE_NAME>.
 file may be placed in this directory.  However, during creation of the common 
 GATK conda environment, all packages will be combined and pip-installed as a 
 single package named ``gatkpythonpackages`` by `setup.py`.
+
+For development, it is usually easier to install live/editable versions of these packages
+(i.e., by running `pip install --editable .` in this directory), so that any code changes are immediately reflected in
+the underlying environment. To do this, 1) remove the pip install of the `gatkpythonpackages.zip` archive from the
+conda environment file, 2) create and activate the corresponding conda environment, and then
+3) run the editable pip install.
diff --git a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/README.txt b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/README.txt
@@ -8,7 +8,7 @@ germline copy number variant (gCNV) tools and workflows.
 This module implements inference schemes for read-depth profile denoising, germline
 integer copy number variation discovery, germline contig ploidy determination, associated
 I/O methods, and helper CLI scripts. `gcnvkernel` additionally provides general-purpose
-inference schemas built on the top of `PyMC3` and `theano`.
+inference schemas built on top of `PyMC` and `pytensor`.
 
 The module is organized as follows::
 
@@ -20,7 +20,7 @@ The module is organized as follows::
         genomic intervals, read count data, global and sample-specific posteriors, and
         sample metadata.
 
-    `gcnvkernel.models`: `PyMC3` model declarations, `theano` symbolic operations (e.g.
+    `gcnvkernel.models`: `PyMC` model declarations, `pytensor` symbolic operations (e.g.
         forward-backward algorithm for HMMs), and custom probability distributions.
 
     `gcnvkernel.preprocess`: Routines for filtering interval lists.

diff --git a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/__init__.py b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/__init__.py
@@ -1,4 +1,4 @@
-from pymc3 import __version__ as pymc3_version
+from pymc import __version__ as pymc_version
 
 from ._version import __version__
 from .io import io_commons, io_consts, io_ploidy, io_denoising_calling, \
@@ -26,6 +26,6 @@
 from .tasks.inference_task_base import ConvergenceError
 from .utils import cli_commons, math
 
-assert pymc3_version == "3.1", "gcnvkernel currently only supports PyMC3 3.1; version found: {0}; " \
-                               "please upgrade or downgrade the PyMC3 module in your python environment " \
-                               "accordingly.".format(pymc3_version)
+assert pymc_version == "5.10.1", "gcnvkernel currently only supports PyMC 5.10.1; version found: {0}; " \
+                                 "please upgrade or downgrade the PyMC module in your python environment " \
+                                 "accordingly.".format(pymc_version)
diff --git a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/_version.py b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/_version.py
@@ -1 +1 @@
-__version__ = '0.8'
+__version__ = '0.9'
diff --git a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/config.py b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/config.py
@@ -1,4 +1,4 @@
-# let theano share memory workspace on large tensors with numpy
+# let pytensor share memory workspace on large tensors with numpy
 borrow_numpy = True
 
 # if a normalized PMF violates total probability by the following threshold, it will

diff --git a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/inference/convergence_tracker.py b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/inference/convergence_tracker.py
@@ -1,5 +1,5 @@
 import numpy as np
-from pymc3.variational.callbacks import Callback
+from pymc.variational.callbacks import Callback
 
 from ..utils.rls import NonStationaryLinearRegression
 

diff --git a/...main/python/org/broadinstitute/hellbender/gcnvkernel/inference/deterministic_annealing.py b/...main/python/org/broadinstitute/hellbender/gcnvkernel/inference/deterministic_annealing.py
@@ -1,4 +1,4 @@
-import pymc3 as pm
+import pymc as pm
 from .. import types
 
 Operator = pm.operators.Operator
@@ -14,16 +14,15 @@ def __init__(self,
         """Initializer.
 
         Args:
-            approx: an instance of PyMC3 approximation
-            temperature: a scalar shared theano tensor variable
+            approx: an instance of PyMC approximation
+            temperature: a scalar shared pytensor tensor variable
         """
         super().__init__(approx)
         assert temperature is not None
         self.temperature = temperature
 
     def apply(self, f):
-        z = self.input
-        return self.temperature * self.logq_norm(z) - self.logp_norm(z)
+        return (self.temperature * self.logq_norm - self.logp_norm)[0]
 
 
 class ADVIDeterministicAnnealing(Inference):
@@ -32,24 +31,24 @@ class ADVIDeterministicAnnealing(Inference):
     Note:
         Temperature is not updated automatically by this class. This task is delegated to the ADVI step
         function. This can be done by including a temperature update in `more_updates`; refer to
-        `pymc3.opvi.ObjectiveFunction.step_function` for more information.
+        `pymc.opvi.ObjectiveFunction.step_function` for more information.
 
     """
     def __init__(self,
                  local_rv=None,
                  model=None,
                  cost_part_grad_scale=1,
                  scale_cost_to_minibatch=False,
-                 random_seed=None, start=None,
+                 random_seed=None,
+                 start=None,
                  temperature=None):
 
-        assert temperature is not None, "Temperature (a scalar theano shared tensor) is not provided"
+        assert temperature is not None, "Temperature (a scalar pytensor shared tensor) is not provided"
+        approx = MeanField(local_rv=local_rv,
+                           model=model,
+                           cost_part_grad_scale=cost_part_grad_scale,
+                           scale_cost_to_minibatch=scale_cost_to_minibatch,
+                           random_seed=random_seed,
+                           start=start)
         super().__init__(
-            KLThermal, MeanField, None,
-            local_rv=local_rv,
-            model=model,
-            cost_part_grad_scale=cost_part_grad_scale,
-            scale_cost_to_minibatch=scale_cost_to_minibatch,
-            random_seed=random_seed,
-            start=start,
-            op_kwargs={'temperature': temperature})
+            KLThermal, approx, None, temperature=temperature)