
Commit

test umls update
jal347 committed Apr 12, 2024
1 parent f6c4507 commit ba2fa97
Showing 2 changed files with 174 additions and 30 deletions.
68 changes: 60 additions & 8 deletions src/hub/dataload/sources/umls/dump.py
@@ -1,17 +1,69 @@
import datetime
import os
import os.path
import sys
import time

import biothings, config
biothings.config_for_app(config)
import bs4
import dateutil.parser as dtparser
from biothings.hub.dataload.dumper import DumperException, HTTPDumper
from biothings.utils.common import unzipall

from config import DATA_ARCHIVE_ROOT
from biothings.hub.dataload.dumper import ManualDumper
from biothings.utils.common import unzipall


class UMLSDumper(ManualDumper):
class UMLSDumper(HTTPDumper):

SRC_NAME = "umls"
SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME)

SCHEDULE = "0 12 * * *"
HOMEPAGE_URL = "https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html"

    def get_latest_release(self):
        res = self.client.get(self.__class__.HOMEPAGE_URL)
        # Raise error if status is not 200
        res.raise_for_status()
        html = bs4.BeautifulSoup(res.text, "lxml")
        # Get the table of metathesaurus release files
        table = html.find("table", attrs={"class": "usa-table margin-bottom-4"})
        rows = table.find_all("tr")
        # The header of the fifth column should be 'Date'
        assert (
            rows[0].find_all("th")[4].text.strip() == "Date"
        ), "Could not parse version from html table."
        version = rows[1].find_all("td")[4].text
        try:
            latest = datetime.date.strftime(dtparser.parse(version), "%Y-%m-%d")
            return latest
        except Exception as e:
            raise DumperException(
                "Can't find or parse date from table field {}: {}".format(version, e)
            )

    def create_todump_list(self, force=True):
        self.release = self.get_latest_release()
        if (
            force
            or not self.src_doc
            or (
                self.src_doc
                and self.src_doc.get("download", {}).get("release") < self.release
            )
        ):
            self.logger.info(
                "Manually download from: https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html"
            )
            # Create data folder
            local = os.path.join(self.SRC_ROOT_FOLDER, self.release)
            if not os.path.exists(local):
                os.makedirs(local)
            # Dump a dummy file, to mark dump as successful and trigger uploader
            release_notes = "https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/notes.html"
            self.to_dump.append(
                {
                    "remote": release_notes,
                    "local": os.path.join(local, "release_notes.html"),
                }
            )

    def post_dump(self, *args, **kwargs):
        self.logger.info("Unzipping files in '%s'" % self.new_data_folder)
        unzipall(self.new_data_folder)
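The release comparison in create_todump_list relies on get_latest_release returning an ISO "YYYY-MM-DD" string, so a lexicographic `<` on release strings behaves like a date comparison. A minimal standalone sketch of that normalization, using made-up sample dates rather than values scraped from the NLM table:

# Sketch only, not part of this commit; the sample date strings are hypothetical.
import datetime

import dateutil.parser as dtparser

for version in ("November 6, 2023", "11/06/2023", "2023-11-06"):
    # dateutil accepts many common date formats and returns a datetime
    latest = datetime.date.strftime(dtparser.parse(version), "%Y-%m-%d")
    print(latest)  # every sample normalizes to "2023-11-06"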
136 changes: 114 additions & 22 deletions src/hub/dataload/sources/umls/parser.py
@@ -1,37 +1,67 @@
import glob
import os
import re
import urllib.request
import zipfile
from typing import Union

import bs4
import requests
from biothings.utils.common import open_anyfile

from .umls_secret import UMLS_API_KEY

try:
    from biothings import config

    logger = config.logger
except ImportError:
    import logging

    logger = logging.getLogger(__name__)


# Determine UMLS CUI to Entrez Gene id mappings for genes
# 1. Parse UMLS to determine HGNC ids for each CUI
# 2. Use HGNC to convert HGNC ids to Entrez Gene ids
import os
from collections import defaultdict

from biothings_client import get_client

GENE_CLIENT = get_client('gene')
GENE_CLIENT = get_client("gene")


class ParserException(Exception):
    pass


def query_hgnc(hgnc_ids: list) -> dict:
    """Use biothings_client.py to query HGNC ids and get back '_id' in mygene.info
    :param hgnc_ids: list of HGNC ids
    """
    res = GENE_CLIENT.querymany(hgnc_ids, scopes='HGNC', fields='_id')
    res = GENE_CLIENT.querymany(hgnc_ids, scopes="HGNC", fields="_id")
    new_res = defaultdict(list)
    for item in res:
        if "notfound" not in item:
            new_res[item['query']].append(item['_id'])
            new_res[item["query"]].append(item["_id"])
    return new_res


def query_uniprot(uniprot_ids: list) -> dict:
    """Use biothings_client.py to query UniProt ids and get back '_id' in mygene.info
    :param uniprot_ids: list of UniProt IDs
    """
    res = GENE_CLIENT.querymany(uniprot_ids, scopes='uniprot.Swiss-Prot', fields='_id')
    res = GENE_CLIENT.querymany(uniprot_ids, scopes="uniprot.Swiss-Prot", fields="_id")
    new_res = defaultdict(list)
    for item in res:
        if not item.get("notfound"):
            new_res[item['query']].append(item['_id'])
            new_res[item["query"]].append(item["_id"])
    return new_res

def parse_mrcon(rrf_file):

def parse_mrcon(archive_path, data_path: Union[str, bytes]):
    """Parse the UMLS to determine the HGNC identifier of each gene CUI.
    The relevant files are in the archive <version>-1-meta.nlm (a zip file)
@@ -43,27 +73,28 @@ def parse_mrcon(rrf_file):

    res = defaultdict(set)
    hgnc_ids = set()
    with open(rrf_file, "r") as fin:
    with open_anyfile((archive_path, data_path), "r") as fin:
        for line in fin:
            if "HGNC:" in line:
                vals = line.rstrip("\n").split("|")

                cui = vals[0]
                for val in vals[1:]:
                    if val.startswith("HGNC:"):
                        res[val.split(':')[-1]].add(cui)
                        hgnc_ids.add(val.split(':')[-1])
                        res[val.split(":")[-1]].add(cui)
                        hgnc_ids.add(val.split(":")[-1])
    return res, hgnc_ids

def parse_mrsat(rrf_file):

def parse_mrsat(archive_path, data_path: Union[str, bytes]):
    """Parse the UMLS to determine the UniProt identifier of each protein CUI.
    The relevant file is MRSAT.RRF, which is downloaded from https://download.nlm.nih.gov/umls/kss/2019AB/umls-2019AB-metathesaurus.zip
    """
"""

    res = defaultdict(set)
    uniprot_ids = set()
    with open(rrf_file, "r") as fin:
    with open_anyfile((archive_path, data_path), "r") as fin:
        for line in fin:
            if "SWISS_PROT" in line:
                vals = line.rstrip("\n").split("|")
@@ -72,28 +103,89 @@ def parse_mrsat(rrf_file):
                uniprot_ids.add(vals[-4])
    return res, uniprot_ids


def unlist(l):
    l = list(l)
    if len(l) == 1:
        return l[0]
    return l


def get_download_url():
    res = requests.get(
        "https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html"
    )
    # Raise error if status is not 200
    res.raise_for_status()
    html = bs4.BeautifulSoup(res.text, "lxml")
    # Get the table of metathesaurus release files
    table = html.find("table", attrs={"class": "mb-4"})
    rows = table.find_all("tr")
    # The header of the first column should be 'Release'
    assert (
        rows[0].find_all("th")[0].text == "Release"
    ), "Could not parse url from html table."
    url = None
    try:
        # Get the url from the link
        url = rows[1].find_all("td")[0].a["href"]
        # Create the download url using the API key
        url = f"https://uts-ws.nlm.nih.gov/download?url={url}&apiKey={UMLS_API_KEY}"
        return url
    except Exception as e:
        raise ParserException(f"Can't find or parse url from table field {url}: {e}")


def load_data(data_folder):
    mrsat_file = os.path.join(data_folder, 'MRSAT.RRF')
    mrconso_file = os.path.join(data_folder, 'MRCONSO.RRF')
    hgnc_map, hgnc_ids = parse_mrcon(mrconso_file)
    uniprot_map, uniprot_ids = parse_mrsat(mrsat_file)
    try:
        metathesaurus_file = glob.glob(
            os.path.join(data_folder, "*metathesaurus-release.zip")
        )[0]
    except IndexError:
        url = get_download_url()
        # Use re.sub to replace all characters after "apiKey=" with asterisks
        pii_url = re.sub(
            r"(apiKey=).*",
            r"\1" + "*" * len(re.search(r"(apiKey=)(.*)", url).group(2)),
            url,
        )
        logger.info(
            """Could not find metathesaurus archive in {}.
            Downloading UMLS Metathesaurus file automatically:
            {}
            """.format(
                data_folder, pii_url
            )
        )
        # Download UMLS file to data folder
        urllib.request.urlretrieve(
            url, os.path.join(data_folder, "metathesaurus-release.zip")
        )
        # Get the downloaded file path
        metathesaurus_file = glob.glob(
            os.path.join(data_folder, "*metathesaurus-release.zip")
        )[0]
    file_list = zipfile.ZipFile(metathesaurus_file, mode="r").namelist()
    try:
        # parse_mrsat greps SWISS_PROT lines, which live in MRSAT.RRF (not MRSTY.RRF)
        mrsat_path = [f for f in file_list if f.endswith("MRSAT.RRF")][0]
    except IndexError:
        raise FileNotFoundError("Could not find MRSAT.RRF in archive.")
    try:
        mrconso_path = [f for f in file_list if f.endswith("MRCONSO.RRF")][0]
    except IndexError:
        raise FileNotFoundError("Could not find MRCONSO.RRF in archive.")

    hgnc_map, hgnc_ids = parse_mrcon(metathesaurus_file, mrconso_path)
    uniprot_map, uniprot_ids = parse_mrsat(metathesaurus_file, mrsat_path)
    res = {}
    hgnc2mygeneids = query_hgnc(hgnc_ids)
    uniprot2mygeneids = query_uniprot(uniprot_ids)
    for hgnc, _ids in hgnc2mygeneids.items():
        for _id in _ids:
            res[_id] = {'cui': unlist(hgnc_map[hgnc])}
            res[_id] = {"cui": unlist(hgnc_map[hgnc])}
    for uniprot, _ids in uniprot2mygeneids.items():
        for _id in _ids:
            if _id not in res:
                res[_id] = {}
            res[_id]['protein_cui'] = unlist(uniprot_map[uniprot])
            res[_id]["protein_cui"] = unlist(uniprot_map[uniprot])
    for _id, item in res.items():
        yield {'_id': _id,
               'umls': item}
        yield {"_id": _id, "umls": item}
