Enable making tf examples with maize data.

PiperOrigin-RevId: 508752677
google · Feb 10, 2023 · c11b5a1
1 parent b5546cd
commit c11b5a1
Show file tree

Hide file tree

Showing 3 changed files with 37 additions and 14 deletions.
diff --git a/deepconsensus/models/model_configs.py b/deepconsensus/models/model_configs.py
@@ -108,10 +108,10 @@ def _set_base_transformer_hparams(params):
   # Training
   params.batch_size = 256
   # We use this number of epochs to obtain fast training results.
-  params.num_epochs = 7
+  params.num_epochs = 9
   # We use this number of epochs to obtain the finalized models. This parameter
   # keeps the learning rate schedule the same when num_epochs is changed.
-  params.num_epochs_for_decay = 7
+  params.num_epochs_for_decay = 9
   params.buffer_size = 1_000_000
 
   # Optimizer params (values obtained in b/246369335#comment3).

diff --git a/deepconsensus/preprocess/pre_lib.py b/deepconsensus/preprocess/pre_lib.py
@@ -1029,11 +1029,26 @@ def read_truth_split(split_fname: str) -> Dict[str, str]:
   """Reads in split bed file and returns dict."""
   contig_split = {}
   split_regions = {}
-  for i in dc_constants.HUMAN_TRAIN_REGIONS:
+  if any([x in split_fname.lower() for x in ['chm13', 'hg00', 'human']]):
+    train_regions = dc_constants.TRAIN_REGIONS['HUMAN']
+    eval_regions = dc_constants.EVAL_REGIONS['HUMAN']
+    test_regions = dc_constants.TEST_REGIONS['HUMAN']
+  elif 'maize' in split_fname.lower():
+    train_regions = dc_constants.TRAIN_REGIONS['MAIZE']
+    eval_regions = dc_constants.EVAL_REGIONS['MAIZE']
+    test_regions = dc_constants.TEST_REGIONS['MAIZE']
+  else:
+    raise ValueError(
+        f'{split_fname} does not correspond to any genome specified in'
+        f' dc_constants.py. Please either either change {split_fname} name or'
+        ' add new train/eval/test regions to dc_constants.py'
+    )
+
+  for i in train_regions:
     split_regions[i] = 'train'
-  for i in dc_constants.HUMAN_EVAL_REGIONS:
+  for i in eval_regions:
     split_regions[i] = 'eval'
-  for i in dc_constants.HUMAN_TEST_REGIONS:
+  for i in test_regions:
     split_regions[i] = 'test'
   with tf.io.gfile.GFile(split_fname, 'r') as f:
     for line in f:

diff --git a/deepconsensus/utils/dc_constants.py b/deepconsensus/utils/dc_constants.py
@@ -92,15 +92,23 @@ class Strand(int, enum.Enum):
     'EVAL': (0, 464252),
     'TEST': (4178271, 4642522),
 }
-# chrs 1-18, X, and Y. All with and without 'chr'.
-HUMAN_TRAIN_REGIONS = (
-    [str(i) for i in range(1, 19)]
-    + ['chr%d' % i for i in range(1, 19)]
-    + ['X', 'Y', 'chrX', 'chrY']
-)
-# chrs 21 and 22, both with and without 'chr'.
-HUMAN_EVAL_REGIONS = ['21', '22', 'chr21', 'chr22']
-HUMAN_TEST_REGIONS = ['19', '20', 'chr19', 'chr20']
+TRAIN_REGIONS = {
+    'HUMAN': (
+        [str(i) for i in range(1, 19)]
+        + ['chr%d' % i for i in range(1, 19)]
+        + ['X', 'Y', 'chrX', 'chrY']
+    ),
+    'MAIZE': [str(i) for i in range(1, 9)] + ['chr%d' % i for i in range(1, 9)],
+}
+
+EVAL_REGIONS = {
+    'HUMAN': ['21', '22', 'chr21', 'chr22'],
+    'MAIZE': ['9', 'chr9'],
+}
+TEST_REGIONS = {
+    'HUMAN': ['19', '20', 'chr19', 'chr20'],
+    'MAIZE': ['10', 'chr10'],
+}
 
 # List of features in DC examples.
 DC_FEATURES = [