-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge remote-tracking branch 'origin/master' into add-curie-id-support
- Loading branch information
Showing
15 changed files
with
599 additions
and
101 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
-e git+https://github.com/biothings/biothings.api.git@0.12.x#egg=biothings[hub] | ||
biopython # refseq | ||
pandas # umls | ||
lxml # bs4 parsing |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,71 @@ | ||
import datetime | ||
import os | ||
import os.path | ||
import sys | ||
import time | ||
|
||
import biothings, config | ||
biothings.config_for_app(config) | ||
import bs4 | ||
import dateutil.parser as dtparser | ||
from biothings.hub.dataload.dumper import DumperException, HTTPDumper | ||
from biothings.utils.common import unzipall | ||
|
||
from config import DATA_ARCHIVE_ROOT | ||
from biothings.hub.dataload.dumper import ManualDumper | ||
from biothings.utils.common import unzipall | ||
|
||
|
||
class UMLSDumper(ManualDumper): | ||
class UMLSDumper(HTTPDumper): | ||
|
||
SRC_NAME = "umls" | ||
SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME) | ||
|
||
SCHEDULE = "0 12 * * *" | ||
HOMEPAGE_URL = "https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html" | ||
|
||
def get_latest_release(self): | ||
res = self.client.get(self.__class__.HOMEPAGE_URL) | ||
# Raise error if status is not 200 | ||
res.raise_for_status() | ||
html = bs4.BeautifulSoup(res.text, "lxml") | ||
# Get the table of metathesaurus release files | ||
table = html.find( | ||
"table", attrs={"class": "usa-table border-base-lighter margin-bottom-4"} | ||
) | ||
rows = table.find_all("tr") | ||
# The header of the fifth column should be 'Date' | ||
assert ( | ||
rows[0].find_all("th")[4].text.strip() == "Date" | ||
), "Could not parse version from html table." | ||
version = rows[1].find_all("td")[4].text | ||
try: | ||
latest = datetime.date.strftime(dtparser.parse(version), "%Y-%m-%d") | ||
return latest | ||
except Exception as e: | ||
raise DumperException( | ||
"Can't find or parse date from table field {}: {}" % (version, e) | ||
) | ||
|
||
def create_todump_list(self, force=True): | ||
self.release = self.get_latest_release() | ||
if ( | ||
force | ||
or not self.src_doc | ||
or ( | ||
self.src_doc | ||
and self.src_doc.get("download", {}).get("release") < self.release | ||
) | ||
): | ||
self.logger.info( | ||
"Manually download from: https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html" | ||
) | ||
# Create data folder | ||
local = os.path.join(self.SRC_ROOT_FOLDER, self.release) | ||
if not os.path.exists(local): | ||
os.makedirs(local) | ||
# Dump a dummy file, to mark dump as successful and trigger uploader | ||
release_notes = "https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/notes.html" | ||
self.to_dump.append( | ||
{ | ||
"remote": release_notes, | ||
"local": os.path.join(local, "release_notes.html"), | ||
} | ||
) | ||
|
||
def post_dump(self, *args, **kwargs): | ||
self.logger.info("Unzipping files in '%s'" % self.new_data_folder) | ||
unzipall(self.new_data_folder) |
Oops, something went wrong.