Merge remote-tracking branch 'origin/master' into add-curie-id-support
jschaff committed Apr 19, 2024
2 parents cf11bdb + 03c90fa commit 920b744
Showing 15 changed files with 599 additions and 101 deletions.
1 change: 1 addition & 0 deletions requirements_hub.txt
@@ -1,3 +1,4 @@
 -e git+https://github.com/biothings/biothings.api.git@0.12.x#egg=biothings[hub]
 biopython # refseq
 pandas # umls
+lxml # bs4 parsing
61 changes: 34 additions & 27 deletions src/config_hub.py
@@ -4,13 +4,13 @@

 # Refer to biothings.hub.default_config for all configurable settings
 
-DATA_SRC_SERVER = 'localhost'
+DATA_SRC_SERVER = "localhost"
 DATA_SRC_PORT = 27017
-DATA_SRC_DATABASE = 'mygene_src'
+DATA_SRC_DATABASE = "mygene_src"
 
-DATA_TARGET_SERVER = 'localhost'
+DATA_TARGET_SERVER = "localhost"
 DATA_TARGET_PORT = 27017
-DATA_TARGET_DATABASE = 'mygene'
+DATA_TARGET_DATABASE = "mygene"
 
 HUB_DB_BACKEND = {
     "module": "biothings.utils.mongo",
@@ -33,26 +33,28 @@
     },
     "env": {
         "prod": {
-            "host": "<PRODSERVER>:9200",
+            "host": "http://<PRODSERVER>:9200",
             "indexer": {
                 "args": {
                     "timeout": 300,
                     "retry_on_timeout": True,
                     "max_retries": 10,
                 },
             },
-            "index": [{"index": "genedoc_mygene_allspecies_current", "doc_type": "gene"}]
+            "index": [
+                {"index": "genedoc_mygene_allspecies_current", "doc_type": "gene"}
+            ],
         },
         "local": {
-            "host": "localhost:9200",
+            "host": "http://localhost:9200",
             "indexer": {
                 "args": {
                     "timeout": 300,
                     "retry_on_timeout": True,
                     "max_retries": 10,
                 },
             },
-            "index": [{"index": "mygene_gene_allspecies_current", "doc_type": "gene"}]
+            "index": [{"index": "mygene_gene_allspecies_current", "doc_type": "gene"}],
         },
     },
 }
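
Note on the "host" changes in this hunk: recent Elasticsearch Python clients require node URLs to carry an explicit scheme, so bare "localhost:9200" values stop working. A minimal sketch of an equivalent client setup, assuming elasticsearch-py 8.x, where the indexer args above map to request_timeout/retry_on_timeout/max_retries:

    from elasticsearch import Elasticsearch

    # In 8.x, a bare "localhost:9200" is rejected at client creation;
    # the scheme-qualified URL matches the "host" values added above.
    es = Elasticsearch(
        "http://localhost:9200",
        request_timeout=300,
        retry_on_timeout=True,
        max_retries=10,
    )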
@@ -71,9 +73,9 @@
"name": "gene_repository-$(Y)",
"type": "s3",
"settings": {
"bucket": "<SNAPSHOT_BUCKET_NAME>",
"base_path": "mygene.info/$(Y)", # per year
"region": "us-west-2",
"bucket": "<SNAPSHOT_BUCKET_NAME>",
"base_path": "mygene.info/$(Y)", # per year
"region": "us-west-2",
},
"acl": "private",
},
@@ -95,9 +97,9 @@
"name": "gene_repository-demo-$(Y)",
"type": "s3",
"settings": {
"bucket": "<SNAPSHOT_DEMO_BUCKET_NAME>",
"base_path": "mygene.info/$(Y)", # per year
"region": "us-west-2",
"bucket": "<SNAPSHOT_DEMO_BUCKET_NAME>",
"base_path": "mygene.info/$(Y)", # per year
"region": "us-west-2",
},
"acl": "public",
},
@@ -108,7 +110,7 @@
         # when creating a snapshot, how long should we wait before querying ES
         # to check snapshot status/completion ? (in seconds)
         "monitor_delay": 10,
-        }
+        },
     }
 }
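
For context, "monitor_delay" drives a poll loop along these lines; this is a sketch using the stock elasticsearch-py snapshot API, not the hub's actual implementation:

    import time

    def wait_for_snapshot(es, repo, snap, monitor_delay=10):
        # Poll ES until the snapshot leaves IN_PROGRESS, waiting
        # monitor_delay seconds between status checks.
        while True:
            status = es.snapshot.status(repository=repo, snapshot=snap)
            state = status["snapshots"][0]["state"]
            if state != "IN_PROGRESS":
                return state  # e.g. "SUCCESS" or "FAILED"
            time.sleep(monitor_delay)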

@@ -153,7 +155,7 @@
"region": "us-west-2",
"auto": True, # automatically generate diff ? Careful if lots of changes
},
}
},
}
}

@@ -164,38 +166,38 @@
 # Autohub configuration, either from a static definition...
 STANDALONE_CONFIG = {
     "_default": {
-        "es_host": "localhost:9200",
+        "es_host": "http://localhost:9200",
         "index": "mygene_test",
-        "doc_type": "gene"
+        "doc_type": "gene",
     },
     "mygene.info": {
         "es_host": "prodserver:9200",
         "index": "mygene_prod",
-        "doc_type": "gene"
+        "doc_type": "gene",
     },
 }
 # ... or using a dynamic indexer factory and ES host (index names are then
 # taken from VERSION_URLS and all are managed on one given ES host)
-#AUTOHUB_INDEXER_FACTORY = "biothings.hub.dataindex.indexer.DynamicIndexerFactory"
-#AUTOHUB_ES_HOST = "localhost:9200"
+# AUTOHUB_INDEXER_FACTORY = "biothings.hub.dataindex.indexer.DynamicIndexerFactory"
+# AUTOHUB_ES_HOST = "localhost:9200"



########################################
@@ -221,5 +223,10 @@
"zebrafish": {"tax_id": "7955", "assembly": "zv9"},
"thale-cress": {"tax_id": "3702"},
"frog": {"tax_id": "8364", "assembly": "xenTro3"},
"pig": {"tax_id": "9823", "assembly": "susScr2"}
"pig": {"tax_id": "9823", "assembly": "susScr2"},
}

# for running tests locally in our biothings hub with testing api
APITEST_PATH = "data_tests"

APITEST_CONFIG = "config_web_local"
5 changes: 1 addition & 4 deletions src/config_web.py
@@ -11,7 +11,7 @@
 # *****************************************************************************
 # Elasticsearch Settings
 # *****************************************************************************
-ES_HOST = "localhost:9200"
+ES_HOST = "http://localhost:9200"
 ES_INDEX = "mygene_current"
 ES_DOC_TYPE = "gene"
 
@@ -196,6 +196,3 @@

 # url template to redirect for 'include_tax_tree' parameter
 INCLUDE_TAX_TREE_REDIRECT_ENDPOINT = "http://t.biothings.io/v1/taxon"
-
-# for running tests locally in our biothings client
-PYTEST_PATH = "tests/data_tests"
2 changes: 1 addition & 1 deletion src/hub/dataload/sources/clingen/upload.py
@@ -23,7 +23,7 @@ class ClingenUploader(biothings.hub.dataload.uploader.BaseSourceUploader):
         }
     }
     idconverter = None
-    storage_class = biothings.hub.dataload.storage.IgnoreDuplicatedStorage
+    storage_class = biothings.utils.storage.IgnoreDuplicatedStorage
 
     def load_data(self, data_folder):
         self.logger.info("Load data from directory: '%s'" % data_folder)
18 changes: 15 additions & 3 deletions src/hub/dataload/sources/reactome/dump.py
@@ -1,14 +1,20 @@
+import datetime
+import os
+import sys
+import time
+
+import biothings
+import requests
+
-import os, sys, time, datetime
+import config
 
-import biothings, config
 biothings.config_for_app(config)
 
-from config import DATA_ARCHIVE_ROOT
 from biothings.hub.dataload.dumper import LastModifiedHTTPDumper
 from biothings.utils.common import unzipall
 
+from config import DATA_ARCHIVE_ROOT
 
 
 class ReactomeDumper(LastModifiedHTTPDumper):

@@ -18,3 +24,9 @@ class ReactomeDumper(LastModifiedHTTPDumper):
     SRC_URLS = ["https://reactome.org/download/current/NCBI2Reactome_All_Levels.txt"]
     SCHEDULE = "0 6 * * *"
 
+    def set_release(self):
+        self.release = str(
+            requests.get(
+                "https://reactome.org/ContentService/data/database/version"
+            ).json()
+        )
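
The new set_release relies on Reactome's ContentService, which serves the current database version as a bare number. Standalone, the same call looks like this (a sketch; the value changes with each Reactome release):

    import requests

    # The endpoint returns e.g. 88 as JSON; str() turns it into the
    # release label the dumper stores in self.release.
    version = requests.get(
        "https://reactome.org/ContentService/data/database/version"
    ).json()
    print(str(version))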
70 changes: 62 additions & 8 deletions src/hub/dataload/sources/umls/dump.py
@@ -1,17 +1,71 @@
+import datetime
+import os
+import os.path
+import sys
+import time
+
 import biothings, config
 biothings.config_for_app(config)
+import bs4
+import dateutil.parser as dtparser
+from biothings.hub.dataload.dumper import DumperException, HTTPDumper
+from biothings.utils.common import unzipall
 
 from config import DATA_ARCHIVE_ROOT
-from biothings.hub.dataload.dumper import ManualDumper
-from biothings.utils.common import unzipall
 
 
-class UMLSDumper(ManualDumper):
+class UMLSDumper(HTTPDumper):
 
     SRC_NAME = "umls"
     SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME)
 
+    SCHEDULE = "0 12 * * *"
+    HOMEPAGE_URL = "https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html"
+
+    def get_latest_release(self):
+        res = self.client.get(self.__class__.HOMEPAGE_URL)
+        # Raise error if status is not 200
+        res.raise_for_status()
+        html = bs4.BeautifulSoup(res.text, "lxml")
+        # Get the table of metathesaurus release files
+        table = html.find(
+            "table", attrs={"class": "usa-table border-base-lighter margin-bottom-4"}
+        )
+        rows = table.find_all("tr")
+        # The header of the fifth column should be 'Date'
+        assert (
+            rows[0].find_all("th")[4].text.strip() == "Date"
+        ), "Could not parse version from html table."
+        version = rows[1].find_all("td")[4].text
+        try:
+            latest = datetime.date.strftime(dtparser.parse(version), "%Y-%m-%d")
+            return latest
+        except Exception as e:
+            raise DumperException(
+                "Can't find or parse date from table field {}: {}".format(version, e)
+            )

+    def create_todump_list(self, force=True):
+        self.release = self.get_latest_release()
+        if (
+            force
+            or not self.src_doc
+            or (
+                self.src_doc
+                and self.src_doc.get("download", {}).get("release") < self.release
+            )
+        ):
+            self.logger.info(
+                "Manually download from: https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html"
+            )
+            # Create data folder
+            local = os.path.join(self.SRC_ROOT_FOLDER, self.release)
+            if not os.path.exists(local):
+                os.makedirs(local)
+            # Dump a dummy file, to mark dump as successful and trigger uploader
+            release_notes = "https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/notes.html"
+            self.to_dump.append(
+                {
+                    "remote": release_notes,
+                    "local": os.path.join(local, "release_notes.html"),
+                }
+            )
 
     def post_dump(self, *args, **kwargs):
         self.logger.info("Unzipping files in '%s'" % self.new_data_folder)
         unzipall(self.new_data_folder)
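
To see the scraping in get_latest_release in isolation, here is a sketch run against a toy copy of the NLM release table (hypothetical cell values; the real page is fetched through self.client):

    import datetime

    import bs4
    import dateutil.parser as dtparser

    # Toy stand-in for the NLM release table; the column layout is
    # assumed from the assertions above, the cell values are invented.
    PAGE = """
    <table class="usa-table border-base-lighter margin-bottom-4">
      <tr><th>File</th><th>Size</th><th>MD5</th><th>Docs</th><th>Date</th></tr>
      <tr><td>umls-full.zip</td><td>1 GB</td><td>abc123</td><td>-</td><td>November 6, 2023</td></tr>
    </table>
    """

    html = bs4.BeautifulSoup(PAGE, "lxml")
    table = html.find("table", attrs={"class": "usa-table border-base-lighter margin-bottom-4"})
    rows = table.find_all("tr")
    assert rows[0].find_all("th")[4].text.strip() == "Date"  # fifth column is the date
    version = rows[1].find_all("td")[4].text                 # "November 6, 2023"
    print(datetime.date.strftime(dtparser.parse(version), "%Y-%m-%d"))  # -> 2023-11-06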