[go: up, home]

Skip to content

Commit

Permalink
Enable making tf examples with maize data.
Browse files (browse the repository at this point in the history)
PiperOrigin-RevId: 508752677
  • Loading branch information
anastasiyabl authored and Copybara-Service committed Feb 10, 2023
1 parent b5546cd commit c11b5a1
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 14 deletions.
4 changes: 2 additions & 2 deletions deepconsensus/models/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,10 @@ def _set_base_transformer_hparams(params):
# Training
params.batch_size = 256
# We use this number of epochs to obtain fast training results.
params.num_epochs = 7
params.num_epochs = 9
# We use this number of epochs to obtain the finalized models. This parameter
# keeps the learning rate schedule the same when num_epochs is changed.
params.num_epochs_for_decay = 7
params.num_epochs_for_decay = 9
params.buffer_size = 1_000_000

# Optimizer params (values obtained in b/246369335#comment3).
Expand Down
21 changes: 18 additions & 3 deletions deepconsensus/preprocess/pre_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -1029,11 +1029,26 @@ def read_truth_split(split_fname: str) -> Dict[str, str]:
"""Reads in split bed file and returns dict."""
contig_split = {}
split_regions = {}
for i in dc_constants.HUMAN_TRAIN_REGIONS:
if any([x in split_fname.lower() for x in ['chm13', 'hg00', 'human']]):
train_regions = dc_constants.TRAIN_REGIONS['HUMAN']
eval_regions = dc_constants.EVAL_REGIONS['HUMAN']
test_regions = dc_constants.TEST_REGIONS['HUMAN']
elif 'maize' in split_fname.lower():
train_regions = dc_constants.TRAIN_REGIONS['MAIZE']
eval_regions = dc_constants.EVAL_REGIONS['MAIZE']
test_regions = dc_constants.TEST_REGIONS['MAIZE']
else:
raise ValueError(
f'{split_fname} does not correspond to any genome specified in'
f' dc_constants.py. Please either either change {split_fname} name or'
' add new train/eval/test regions to dc_constants.py'
)

for i in train_regions:
split_regions[i] = 'train'
for i in dc_constants.HUMAN_EVAL_REGIONS:
for i in eval_regions:
split_regions[i] = 'eval'
for i in dc_constants.HUMAN_TEST_REGIONS:
for i in test_regions:
split_regions[i] = 'test'
with tf.io.gfile.GFile(split_fname, 'r') as f:
for line in f:
Expand Down
26 changes: 17 additions & 9 deletions deepconsensus/utils/dc_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,15 +92,23 @@ class Strand(int, enum.Enum):
'EVAL': (0, 464252),
'TEST': (4178271, 4642522),
}
# chrs 1-18, X, and Y. All with and without 'chr'.
HUMAN_TRAIN_REGIONS = (
[str(i) for i in range(1, 19)]
+ ['chr%d' % i for i in range(1, 19)]
+ ['X', 'Y', 'chrX', 'chrY']
)
# chrs 21 and 22, both with and without 'chr'.
HUMAN_EVAL_REGIONS = ['21', '22', 'chr21', 'chr22']
HUMAN_TEST_REGIONS = ['19', '20', 'chr19', 'chr20']
TRAIN_REGIONS = {
'HUMAN': (
[str(i) for i in range(1, 19)]
+ ['chr%d' % i for i in range(1, 19)]
+ ['X', 'Y', 'chrX', 'chrY']
),
'MAIZE': [str(i) for i in range(1, 9)] + ['chr%d' % i for i in range(1, 9)],
}

EVAL_REGIONS = {
'HUMAN': ['21', '22', 'chr21', 'chr22'],
'MAIZE': ['9', 'chr9'],
}
TEST_REGIONS = {
'HUMAN': ['19', '20', 'chr19', 'chr20'],
'MAIZE': ['10', 'chr10'],
}

# List of features in DC examples.
DC_FEATURES = [
Expand Down

0 comments on commit c11b5a1

Please sign in to comment.