[go: nahoru, domu]

Skip to content

Commit

Permalink
Update inspecting examples colab.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 512956150
  • Loading branch information
anastasiyabl authored and Copybara-Service committed Feb 28, 2023
1 parent 21a5346 commit 7dd3bb0
Showing 1 changed file with 60 additions and 47 deletions.
107 changes: 60 additions & 47 deletions notebooks/Inspecting_DeepConsensus_examples_and_running_model.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@
"Requirement already satisfied: wheel in /usr/local/lib/python3.8/dist-packages (0.38.4)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0mLooking in indexes: https://test.pypi.org/simple/, https://us-python.pkg.dev/colab-wheels/public/simple/, https://pypi.org/simple\n",
"Collecting deepconsensus[cpu]==1.1.0\n",
" Using cached https://test-files.pythonhosted.org/packages/6e/01/790fbf619116eac308509c1c029bf303ff6684afea77742b43dd150da6ec/deepconsensus-1.1.0-py3-none-any.whl (157 kB)\n",
"Collecting deepconsensus[cpu]==1.2.0\n",
" Using cached https://test-files.pythonhosted.org/packages/6e/01/790fbf619116eac308509c1c029bf303ff6684afea77742b43dd150da6ec/deepconsensus-1.2.0-py3-none-any.whl (157 kB)\n",
"Collecting tf-models-official==2.9.2\n",
" Using cached tf_models_official-2.9.2-py2.py3-none-any.whl (2.1 MB)\n",
"Requirement already satisfied: absl-py==1.0.0 in /usr/local/lib/python3.8/dist-packages (from deepconsensus[cpu]==1.1.0) (1.0.0)\n",
Expand Down Expand Up @@ -174,15 +174,15 @@
" Found existing installation: pandas 1.3.5\n",
" Uninstalling pandas-1.3.5:\n",
" Successfully uninstalled pandas-1.3.5\n",
"Successfully installed deepconsensus-1.1.0 intel-tensorflow-2.9.1 ml-collections-0.1.1 pandas-1.5.1 sacrebleu-2.3.1 seqeval-1.2.2 tensorflow-addons-0.19.0 tensorflow-text-2.9.0 tf-models-official-2.9.2\n",
"Successfully installed deepconsensus-1.2.0 intel-tensorflow-2.9.1 ml-collections-0.1.1 pandas-1.5.1 sacrebleu-2.3.1 seqeval-1.2.2 tensorflow-addons-0.19.0 tensorflow-text-2.9.0 tf-models-official-2.9.2\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m"
]
}
],
"source": [
"! pip install --upgrade pip setuptools wheel\n",
"! pip install deepconsensus[cpu]==1.1.0"
"! pip install deepconsensus[cpu]==1.2.0"
]
},
{
Expand All @@ -197,8 +197,7 @@
"import colorama\n",
"import numpy as np\n",
"import random\n",
"import PIL\n",
"from IPython.display import display\n",
"import matplotlib.pyplot as plt\n",
"from deepconsensus.models import model_configs\n",
"from deepconsensus.models import model_utils\n",
"from deepconsensus.models import data_providers\n",
Expand All @@ -222,47 +221,52 @@
},
"outputs": [],
"source": [
"def plot_array(arr, scale = 5):\n",
" \"\"\"Transforms array for plotting.\"\"\"\n",
" arr = np.copy(arr).astype('float64')\n",
" arr = np.where(arr \u003e 1.0e-10, arr, -10.0)\n",
" arr = np.log(arr, where=arr \u003e 0.0, out=arr) * 100.0\n",
" arr = np.where(arr \u003e 0, arr + 25.0, 0)\n",
" arr = arr.astype('uint8')\n",
" im = PIL.Image.fromarray(arr, 'P')\n",
" im = im.resize((im.size[0] * scale, im.size[1] * scale))\n",
" display(im)\n",
"def plot_array(arr):\n",
" \"\"\"Plot the array.\"\"\"\n",
" plt.figure(figsize = (20,10))\n",
" plt.imshow(arr, cmap=\"Greys_r\", vmin=0, vmax=20)\n",
" plt.axis('off')\n",
" plt.show()\n",
"\n",
"def show_rows_for_one_example(rows, max_passes=20):\n",
"def show_rows_for_one_example(rows, max_passes=20, use_ccs_bq=True):\n",
" \"\"\"Break out the black/white matrix into subreads, ccs, PW, IP, etc.\n",
"\n",
" For each of `max_subreads`, we have four pieces of information: bases, PW, IP,\n",
" and strand. We also have one row for CCS, and four rows for SN (in that\n",
" order).\n",
" The information is structured as follows:\n",
" Bases: rows 0 to (params.max_passes - 1)\n",
" PW: rows (params.max_passes) to (params.max_passes * 2 - 1)\n",
" IP: rows (params.max_passes * 2) to (params.max_passes * 3 - 1)\n",
" Strand: rows (params.max_passes * 3) to (params.max_passes * 4 - 1)\n",
" CCS+SN: rows (params.max_passes * 4) to (params.max_passes * 4 + 5)\n",
" For each of `max_subreads`, we have multiple pieces of information: bases, PW,\n",
" IP, and strand. We also have one row for CCS, another row for CCS base\n",
" qualities (optionally) and four rows for SN.\n",
" \"\"\"\n",
" (\n",
" base_indices,\n",
" pw_indices,\n",
" ip_indices,\n",
" strand_indices,\n",
" ccs_indices,\n",
" ccs_bq_indices,\n",
" sn_indices,\n",
" ) = data_providers.get_indices(max_passes, use_ccs_bq)\n",
" \n",
" print(base_indices)\n",
" print('bases:')\n",
" plot_array(rows[0:max_passes])\n",
" plot_array(rows[slice(*base_indices)])\n",
" print('PW: pulse-width')\n",
" pw = rows[max_passes:max_passes*2]\n",
" pw = rows[slice(*pw_indices)]\n",
" plot_array(pw)\n",
" print('IP: inter-pulse duration')\n",
" ip = rows[max_passes*2:max_passes*3]\n",
" ip = rows[slice(*ip_indices)]\n",
" plot_array(ip)\n",
"\n",
" print('Strand:')\n",
" plot_array(rows[max_passes*3:-5])\n",
" plot_array(rows[slice(*strand_indices)])\n",
"\n",
" print('CCS:')\n",
" ccs = rows[-5:-4]\n",
" ccs = rows[slice(*ccs_indices)]\n",
" plot_array(ccs)\n",
" if use_ccs_bq:\n",
" print('CCS base quality scores:')\n",
" ccs_bq = rows[slice(*ccs_bq_indices)]\n",
" plot_array(ccs_bq)\n",
" print('SN:')\n",
" sn = rows[-4:]\n",
" sn = rows[slice(*sn_indices)]\n",
" plot_array(sn)"
]
},
Expand All @@ -279,14 +283,19 @@
" return ''.join([dc_constants.SEQ_VOCAB[int(base)] for base in encoded_sequence])\n",
"\n",
"\n",
"def get_ccs_matrix(rows_for_one_example):\n",
"def get_ccs_matrix(rows_for_one_example, max_passes, use_ccs_bq):\n",
" \"\"\"Slice the encoded CCS out of the 2d rows matrix of one example.\"\"\"\n",
" return rows_for_one_example[-5, :]\n",
" _, _, _, _, ccs_indices, _, _ = data_providers.get_indices(\n",
" max_passes=max_passes,\n",
" use_ccs_bq=use_ccs_bq,\n",
" )\n",
" return rows_for_one_example[slice(*ccs_indices), :][0]\n",
"\n",
"\n",
"def get_ccs_seq(rows_for_one_example):\n",
"def get_ccs_seq(rows_for_one_example, max_passes, use_ccs_bq):\n",
" \"\"\"Get CCS string out of the 2d rows matrix of one example.\"\"\"\n",
" return encoded_sequence_to_string(get_ccs_matrix(rows_for_one_example))\n",
" return encoded_sequence_to_string(get_ccs_matrix(\n",
" rows_for_one_example, max_passes, use_ccs_bq))\n",
"\n",
"\n",
"def colorful(seq):\n",
Expand All @@ -305,11 +314,11 @@
" return ''.join(colored_seq)\n",
"\n",
"\n",
"def show_example(batch, example_i, ypreds=None, max_passes=20):\n",
"def show_example(batch, example_i, ypreds=None, max_passes=20, use_ccs_bq=True):\n",
" \"\"\"Show an example with subreads, ccs, predictions, and labels.\"\"\"\n",
" batch_size = batch['rows'].shape[0]\n",
" rows = batch['rows'][example_i, :, :, 0]\n",
" ccs_i = get_ccs_seq(rows)\n",
" ccs_i = get_ccs_seq(rows, max_passes, use_ccs_bq)\n",
" print(f'Example: {example_i} of {batch_size} (batch)')\n",
" print('How the sequences are represented for the model:')\n",
" subreads = rows[0:max_passes, :]\n",
Expand Down Expand Up @@ -360,12 +369,12 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Copying gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/checkpoint.data-00000-of-00001...\n",
"Copying gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/checkpoint.index...\n",
"Copying gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/params.json...\n",
"Copying gs://brain-genomics-public/research/deepconsensus/models/v1.2/model_checkpoint/checkpoint.data-00000-of-00001...\n",
"Copying gs://brain-genomics-public/research/deepconsensus/models/v1.2/model_checkpoint/checkpoint.index...\n",
"Copying gs://brain-genomics-public/research/deepconsensus/models/v1.2/model_checkpoint/params.json...\n",
"- [3 files][ 85.7 MiB/ 85.7 MiB] \n",
"Operation completed over 3 objects/85.7 MiB. \n",
"Copying gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.1/test/tf-test-00000-of-00500.tfrecord.gz...\n",
"Copying gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.2/test/tf-test-00000-of-00500.tfrecord.gz...\n",
"/ [1 files][ 6.9 MiB/ 6.9 MiB] \n",
"Operation completed over 1 objects/6.9 MiB. \n"
]
Expand All @@ -374,9 +383,9 @@
"source": [
"# Download Model\n",
"! mkdir -p deepconsensus_model\n",
"! gsutil cp -r gs://brain-genomics-public/research/deepconsensus/models/v1.1/model_checkpoint/* deepconsensus_model/\n",
"! gsutil cp -r gs://brain-genomics-public/research/deepconsensus/models/v1.2/model_checkpoint/* deepconsensus_model/\n",
"# Download test data\n",
"! gsutil cp gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.1/test/tf-test-00000-of-00500.tfrecord.gz ./tf-test.tfrecord.gz"
"! gsutil cp gs://brain-genomics-public/research/deepconsensus/training-tutorial/v1.2/test/tf-test-00000-of-00500.tfrecord.gz ./tf-test.tfrecord.gz"
]
},
{
Expand Down Expand Up @@ -451,7 +460,7 @@
"model = model_utils.get_model(params)\n",
"checkpoint = tf.train.Checkpoint(model=model)\n",
"\n",
"row_size = data_providers.get_total_rows(params.max_passes)\n",
"row_size = data_providers.get_total_rows(params.max_passes, params.use_ccs_bq)\n",
"input_shape = (1, row_size, params.max_length, params.num_channels)\n",
"model_utils.print_model_summary(model, input_shape)\n",
"checkpoint.restore(\n",
Expand Down Expand Up @@ -500,7 +509,7 @@
"text": [
"name.shape: (20, 1)\n",
"label.shape: (20, 100)\n",
"rows.shape: (20, 85, 100, 1)\n",
"rows.shape: (20, 86, 100, 1)\n",
"num_passes.shape: (20, 1)\n",
"window_pos.shape: (20, 1)\n"
]
Expand Down Expand Up @@ -604,7 +613,7 @@
"source": [
"### Let's break that down\n",
"\n",
"Each matrix is composed of multiple data types layered in rows. The bases, pulse-width, inter-pulse duration, and strand are per-subread, up to the max number of subreads (this is in params.max_passes, which as of DeepConsensus v1.1 is 20). The input also contains a draft circular consensus sequence (CCS) from PacBio and the signal-to-noise ratio (SN)."
"Each matrix is composed of multiple data types layered in rows. The bases, pulse-width, inter-pulse duration, and strand are per-subread, up to the max number of subreads (this is in params.max_passes, which as of DeepConsensus v1.2 is 20). The input also contains a draft circular consensus sequence (CCS) from PacBio and the signal-to-noise ratio (SN)."
]
},
{
Expand Down Expand Up @@ -978,6 +987,10 @@
],
"metadata": {
"colab": {
"last_runtime": {
"build_target": "//learning/genomics/internal:genomics_colab",
"kind": "private"
},
"provenance": [
{
"file_id": "/piper/depot/google3/learning/genomics/deepconsensus/opensource_only/g3doc/notebooks/Copy_of_Inspecting_DeepConsensus_examples_and_running_model.ipynb?workspaceId=belyaeva:dc_colab::citc",
Expand Down

0 comments on commit 7dd3bb0

Please sign in to comment.