Merge remote-tracking branch 'origin/master' into add-curie-id-support
jschaff committed Apr 19, 2024
2 parents cf11bdb + 03c90fa commit 920b744
Showing 15 changed files with 599 additions and 101 deletions.
1 change: 1 addition & 0 deletions requirements_hub.txt
@@ -1,3 +1,4 @@
 -e git+https://github.com/biothings/biothings.api.git@0.12.x#egg=biothings[hub]
 biopython # refseq
 pandas # umls
+lxml # bs4 parsing
61 changes: 34 additions & 27 deletions src/config_hub.py
@@ -4,13 +4,13 @@

 # Refer to biothings.hub.default_config for all configurable settings
 
-DATA_SRC_SERVER = 'localhost'
+DATA_SRC_SERVER = "localhost"
 DATA_SRC_PORT = 27017
-DATA_SRC_DATABASE = 'mygene_src'
+DATA_SRC_DATABASE = "mygene_src"
 
-DATA_TARGET_SERVER = 'localhost'
+DATA_TARGET_SERVER = "localhost"
 DATA_TARGET_PORT = 27017
-DATA_TARGET_DATABASE = 'mygene'
+DATA_TARGET_DATABASE = "mygene"
 
 HUB_DB_BACKEND = {
     "module": "biothings.utils.mongo",
@@ -33,26 +33,28 @@
     },
     "env": {
         "prod": {
-            "host": "<PRODSERVER>:9200",
+            "host": "http://<PRODSERVER>:9200",
             "indexer": {
                 "args": {
                     "timeout": 300,
                     "retry_on_timeout": True,
                     "max_retries": 10,
                 },
             },
-            "index": [{"index": "genedoc_mygene_allspecies_current", "doc_type": "gene"}]
+            "index": [
+                {"index": "genedoc_mygene_allspecies_current", "doc_type": "gene"}
+            ],
         },
         "local": {
-            "host": "localhost:9200",
+            "host": "http://localhost:9200",
             "indexer": {
                 "args": {
                     "timeout": 300,
                     "retry_on_timeout": True,
                     "max_retries": 10,
                 },
             },
-            "index": [{"index": "mygene_gene_allspecies_current", "doc_type": "gene"}]
+            "index": [{"index": "mygene_gene_allspecies_current", "doc_type": "gene"}],
         },
     },
 }
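
Note on the "host" changes in this hunk: recent Elasticsearch Python clients require node URLs to carry an explicit scheme, so bare "localhost:9200" values stop working. A minimal sketch of an equivalent client setup, assuming elasticsearch-py 8.x, where the indexer args above map to request_timeout/retry_on_timeout/max_retries:

    from elasticsearch import Elasticsearch

    # In 8.x, a bare "localhost:9200" is rejected at client creation;
    # the scheme-qualified URL matches the "host" values added above.
    es = Elasticsearch(
        "http://localhost:9200",
        request_timeout=300,
        retry_on_timeout=True,
        max_retries=10,
    )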
@@ -71,9 +73,9 @@
"name": "gene_repository-$(Y)",
"type": "s3",
"settings": {
"bucket": "<SNAPSHOT_BUCKET_NAME>",
"base_path": "mygene.info/$(Y)", # per year
"region": "us-west-2",
"bucket": "<SNAPSHOT_BUCKET_NAME>",
"base_path": "mygene.info/$(Y)", # per year
"region": "us-west-2",
},
"acl": "private",
},
@@ -95,9 +97,9 @@
"name": "gene_repository-demo-$(Y)",
"type": "s3",
"settings": {
"bucket": "<SNAPSHOT_DEMO_BUCKET_NAME>",
"base_path": "mygene.info/$(Y)", # per year
"region": "us-west-2",
"bucket": "<SNAPSHOT_DEMO_BUCKET_NAME>",
"base_path": "mygene.info/$(Y)", # per year
"region": "us-west-2",
},
"acl": "public",
},
@@ -108,7 +110,7 @@
         # when creating a snapshot, how long should we wait before querying ES
         # to check snapshot status/completion ? (in seconds)
         "monitor_delay": 10,
-        }
+        },
     }
 }
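
For context, "monitor_delay" drives a poll loop along these lines; this is a sketch using the stock elasticsearch-py snapshot API, not the hub's actual implementation:

    import time

    def wait_for_snapshot(es, repo, snap, monitor_delay=10):
        # Poll ES until the snapshot leaves IN_PROGRESS, waiting
        # monitor_delay seconds between status checks.
        while True:
            status = es.snapshot.status(repository=repo, snapshot=snap)
            state = status["snapshots"][0]["state"]
            if state != "IN_PROGRESS":
                return state  # e.g. "SUCCESS" or "FAILED"
            time.sleep(monitor_delay)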

@@ -153,7 +155,7 @@
"region": "us-west-2",
"auto": True, # automatically generate diff ? Careful if lots of changes
},
}
},
}
}

@@ -164,38 +166,38 @@
 # Autohub configuration, either from a static definition...
 STANDALONE_CONFIG = {
     "_default": {
-        "es_host": "localhost:9200",
+        "es_host": "http://localhost:9200",
         "index": "mygene_test",
-        "doc_type": "gene"
+        "doc_type": "gene",
     },
     "mygene.info": {
         "es_host": "prodserver:9200",
         "index": "mygene_prod",
-        "doc_type": "gene"
+        "doc_type": "gene",
     },
 }
 # ... or using a dynamic indexer factory and ES host (index names are then
 # taken from VERSION_URLS and all are managed on one given ES host)
-#AUTOHUB_INDEXER_FACTORY = "biothings.hub.dataindex.indexer.DynamicIndexerFactory"
-#AUTOHUB_ES_HOST = "localhost:9200"
+# AUTOHUB_INDEXER_FACTORY = "biothings.hub.dataindex.indexer.DynamicIndexerFactory"
+# AUTOHUB_ES_HOST = "localhost:9200"



########################################
@@ -221,5 +223,10 @@
"zebrafish": {"tax_id": "7955", "assembly": "zv9"},
"thale-cress": {"tax_id": "3702"},
"frog": {"tax_id": "8364", "assembly": "xenTro3"},
"pig": {"tax_id": "9823", "assembly": "susScr2"}
"pig": {"tax_id": "9823", "assembly": "susScr2"},
}

# for running tests locally in our biothings hub with testing api
APITEST_PATH = "data_tests"

APITEST_CONFIG = "config_web_local"
5 changes: 1 addition & 4 deletions src/config_web.py
@@ -11,7 +11,7 @@
 # *****************************************************************************
 # Elasticsearch Settings
 # *****************************************************************************
-ES_HOST = "localhost:9200"
+ES_HOST = "http://localhost:9200"
 ES_INDEX = "mygene_current"
 ES_DOC_TYPE = "gene"
 
@@ -196,6 +196,3 @@

 # url template to redirect for 'include_tax_tree' parameter
 INCLUDE_TAX_TREE_REDIRECT_ENDPOINT = "http://t.biothings.io/v1/taxon"
-
-# for running tests locally in our biothings client
-PYTEST_PATH = "tests/data_tests"
2 changes: 1 addition & 1 deletion src/hub/dataload/sources/clingen/upload.py
@@ -23,7 +23,7 @@ class ClingenUploader(biothings.hub.dataload.uploader.BaseSourceUploader):
         }
     }
     idconverter = None
-    storage_class = biothings.hub.dataload.storage.IgnoreDuplicatedStorage
+    storage_class = biothings.utils.storage.IgnoreDuplicatedStorage
 
     def load_data(self, data_folder):
         self.logger.info("Load data from directory: '%s'" % data_folder)
18 changes: 15 additions & 3 deletions src/hub/dataload/sources/reactome/dump.py
@@ -1,14 +1,20 @@
+import datetime
+import os
+import sys
+import time
+
+import biothings
+import requests
+
-import os, sys, time, datetime
+import config
 
-import biothings, config
 biothings.config_for_app(config)
 
-from config import DATA_ARCHIVE_ROOT
 from biothings.hub.dataload.dumper import LastModifiedHTTPDumper
 from biothings.utils.common import unzipall
 
+from config import DATA_ARCHIVE_ROOT
 
 
 class ReactomeDumper(LastModifiedHTTPDumper):

@@ -18,3 +24,9 @@ class ReactomeDumper(LastModifiedHTTPDumper):
     SRC_URLS = ["https://reactome.org/download/current/NCBI2Reactome_All_Levels.txt"]
     SCHEDULE = "0 6 * * *"
 
+    def set_release(self):
+        self.release = str(
+            requests.get(
+                "https://reactome.org/ContentService/data/database/version"
+            ).json()
+        )
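
The new set_release relies on Reactome's ContentService, which serves the current database version as a bare number. Standalone, the same call looks like this (a sketch; the value changes with each Reactome release):

    import requests

    # The endpoint returns e.g. 88 as JSON; str() turns it into the
    # release label the dumper stores in self.release.
    version = requests.get(
        "https://reactome.org/ContentService/data/database/version"
    ).json()
    print(str(version))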
70 changes: 62 additions & 8 deletions src/hub/dataload/sources/umls/dump.py
@@ -1,17 +1,71 @@
+import datetime
+import os
+import os.path
+import sys
+import time
+
 import biothings, config
 biothings.config_for_app(config)
+import bs4
+import dateutil.parser as dtparser
+from biothings.hub.dataload.dumper import DumperException, HTTPDumper
+from biothings.utils.common import unzipall
 
 from config import DATA_ARCHIVE_ROOT
-from biothings.hub.dataload.dumper import ManualDumper
-from biothings.utils.common import unzipall
 
 
-class UMLSDumper(ManualDumper):
+class UMLSDumper(HTTPDumper):
 
     SRC_NAME = "umls"
     SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME)
 
+    SCHEDULE = "0 12 * * *"
+    HOMEPAGE_URL = "https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html"
+
+    def get_latest_release(self):
+        res = self.client.get(self.__class__.HOMEPAGE_URL)
+        # Raise error if status is not 200
+        res.raise_for_status()
+        html = bs4.BeautifulSoup(res.text, "lxml")
+        # Get the table of metathesaurus release files
+        table = html.find(
+            "table", attrs={"class": "usa-table border-base-lighter margin-bottom-4"}
+        )
+        rows = table.find_all("tr")
+        # The header of the fifth column should be 'Date'
+        assert (
+            rows[0].find_all("th")[4].text.strip() == "Date"
+        ), "Could not parse version from html table."
+        version = rows[1].find_all("td")[4].text
+        try:
+            latest = datetime.date.strftime(dtparser.parse(version), "%Y-%m-%d")
+            return latest
+        except Exception as e:
+            raise DumperException(
+                "Can't find or parse date from table field {}: {}".format(version, e)
+            )

+    def create_todump_list(self, force=True):
+        self.release = self.get_latest_release()
+        if (
+            force
+            or not self.src_doc
+            or (
+                self.src_doc
+                and self.src_doc.get("download", {}).get("release") < self.release
+            )
+        ):
+            self.logger.info(
+                "Manually download from: https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html"
+            )
+            # Create data folder
+            local = os.path.join(self.SRC_ROOT_FOLDER, self.release)
+            if not os.path.exists(local):
+                os.makedirs(local)
+            # Dump a dummy file, to mark dump as successful and trigger uploader
+            release_notes = "https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/notes.html"
+            self.to_dump.append(
+                {
+                    "remote": release_notes,
+                    "local": os.path.join(local, "release_notes.html"),
+                }
+            )
 
     def post_dump(self, *args, **kwargs):
         self.logger.info("Unzipping files in '%s'" % self.new_data_folder)
         unzipall(self.new_data_folder)
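
To see the scraping in get_latest_release in isolation, here is a sketch run against a toy copy of the NLM release table (hypothetical cell values; the real page is fetched through self.client):

    import datetime

    import bs4
    import dateutil.parser as dtparser

    # Toy stand-in for the NLM release table; the column layout is
    # assumed from the assertions above, the cell values are invented.
    PAGE = """
    <table class="usa-table border-base-lighter margin-bottom-4">
      <tr><th>File</th><th>Size</th><th>MD5</th><th>Docs</th><th>Date</th></tr>
      <tr><td>umls-full.zip</td><td>1 GB</td><td>abc123</td><td>-</td><td>November 6, 2023</td></tr>
    </table>
    """

    html = bs4.BeautifulSoup(PAGE, "lxml")
    table = html.find("table", attrs={"class": "usa-table border-base-lighter margin-bottom-4"})
    rows = table.find_all("tr")
    assert rows[0].find_all("th")[4].text.strip() == "Date"  # fifth column is the date
    version = rows[1].find_all("td")[4].text                 # "November 6, 2023"
    print(datetime.date.strftime(dtparser.parse(version), "%Y-%m-%d"))  # -> 2023-11-06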