This page collects typical usage examples of the Python method dipper.models.Dataset.Dataset.setFileAccessUrl. If you are wondering what Dataset.setFileAccessUrl does and how to use it, the curated code examples below may help. You can also read further about the class that defines this method, dipper.models.Dataset.Dataset.
Four code examples of Dataset.setFileAccessUrl are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
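Before diving into the examples, here is a minimal sketch of the overall call pattern. The Dataset constructor arguments are copied from Example 1 below and the access URL is a placeholder; treat both as illustrative assumptions rather than the authoritative API.

# A minimal usage sketch, assuming the constructor signature seen in Example 1 below.
from dipper.models.Dataset import Dataset

dataset = Dataset(
    'eom', 'EOM', 'http://elementsofmorphology.nih.gov', None,
    'http://www.genome.gov/copyright.cfm',
    'https://creativecommons.org/publicdomain/mark/1.0/')

# Record where the raw data for this source was fetched from
# (a file URL or a connection string, as the examples below show).
dataset.setFileAccessUrl('https://example.org/data/source-file.tsv')  # placeholder URL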
Example 1: EOM
# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or: from dipper.models.Dataset.Dataset import setFileAccessUrl [as alias]
class EOM(PostgreSQLSource):
"""
Elements of Morphology is a resource from NHGRI that has definitions of
morphological abnormalities, together with image depictions.
We pull those relationships, as well as our local mapping of equivalences
between EOM and HP terminologies.
The website is crawled monthly by NIF's DISCO crawler system,
which we utilize here.
Be sure to have pg user/password connection details in your conf.json file,
like:
dbauth : {
'disco' : {'user' : '<username>', 'password' : '<password>'}
}
Monarch-curated data for the HP to EOM mapping is stored at
https://phenotype-ontologies.googlecode.com
Since this resource is so small, the entirety of it is the "test" set.
"""
# we are using the production view here; should we be using services?
tables = [
'dvp.pr_nlx_157874_1'
]
files = {
'map': {
'file': 'hp-to-eom-mapping.tsv',
'url': 'https://phenotype-ontologies.googlecode.com/svn/trunk/src/ontology/hp/mappings/hp-to-eom-mapping.tsv'
}
}
def __init__(self):
super().__init__('eom')
self.namespaces.update(curie_map.get())
# update the dataset object with details about this resource
# TODO put this into a conf file?
self.dataset = Dataset(
'eom', 'EOM', 'http://elementsofmorphology.nih.gov', None,
'http://www.genome.gov/copyright.cfm',
'https://creativecommons.org/publicdomain/mark/1.0/')
# check if config exists; if it doesn't, error out and let user know
if 'dbauth' not in config.get_config() or \
'disco' not in config.get_config()['dbauth']:
logger.error("not configured with PG user/password.")
# source-specific warnings. will be cleared when resolved.
return
def fetch(self, is_dl_forced=False):
'''create the connection details for DISCO'''
cxn = config.get_config()['dbauth']['disco']
cxn.update(
{'host': 'nif-db.crbs.ucsd.edu', 'database': 'disco_crawler',
'port': 5432})
self.dataset.setFileAccessUrl(
''.join(('jdbc:postgresql://', cxn['host'], ':', str(cxn['port']),
'/', cxn['database'])))
# process the tables
# self.fetch_from_pgdb(self.tables,cxn,100) #for testing
self.fetch_from_pgdb(self.tables, cxn)
self.get_files(is_dl_forced)
# FIXME: Everything needed for data provenance?
st = os.stat('/'.join((self.rawdir, 'dvp.pr_nlx_157874_1')))
filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
self.dataset.setVersion(filedate)
return
def parse(self, limit=None):
'''
Override Source.parse inherited via PostgreSQLSource
'''
if limit is not None:
logger.info("Only parsing first %s rows of each file", limit)
if self.testOnly:
self.testMode = True
logger.info("Parsing files...")
self._process_nlx_157874_1_view('/'.join((self.rawdir,
'dvp.pr_nlx_157874_1')),
limit)
self._map_eom_terms('/'.join((self.rawdir, self.files['map']['file'])),
limit)
logger.info("Finished parsing.")
#.........part of the code omitted here.........
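A brief aside on the fetch() method above: the JDBC-style access URL it records can also be built with an f-string. The connection values below are the ones hard-coded in the example; only the variable name access_url is new.

# Equivalent to the ''.join(...) construction in EOM.fetch() above.
cxn = {'host': 'nif-db.crbs.ucsd.edu', 'database': 'disco_crawler', 'port': 5432}
access_url = f"jdbc:postgresql://{cxn['host']}:{cxn['port']}/{cxn['database']}"
# then: self.dataset.setFileAccessUrl(access_url)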
Example 2: GeneReviews
# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or: from dipper.models.Dataset.Dataset import setFileAccessUrl [as alias]
#.........part of the code omitted here.........
# figure out if the book is there; if so, process, otherwise skip
book_dir = '/'.join((self.rawdir, 'books'))
book_files = os.listdir(book_dir)
if ''.join((nbk, '.html')) not in book_files:
# logger.warning("No book found locally for %s; skipping", nbk)
books_not_found.add(nbk)
continue
logger.info("Processing %s", nbk)
page = open(url)
soup = BeautifulSoup(page.read())
# sec0 == clinical description
clin_summary = \
soup.find(
'div', id=re.compile(".*Summary.sec0"))
if clin_summary is not None:
p = clin_summary.find('p')
ptext = p.text
ptext = re.sub(r'\s+', ' ', ptext)
ul = clin_summary.find('ul')
if ul is not None:
item_text = list()
for li in ul.find_all('li'):
item_text.append(re.sub(r'\s+', ' ', li.text))
ptext += ' '.join(item_text)
# add in the copyright and citation info to description
ptext = \
' '.join(
(ptext,
'[GeneReviews:NBK1116, GeneReviews:NBK138602, ' +
nbk_id+']'))
self.gu.addDefinition(self.graph, nbk_id, ptext.strip())
# get the pubs
pmid_set = set()
pub_div = soup.find('div', id=re.compile(r".*Literature_Cited"))
if pub_div is not None:
ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
for r in ref_list:
for a in r.find_all(
'a', attrs={'href': re.compile(r"pubmed")}):
if re.match(r'PubMed:', a.text):
pmnum = re.sub(r'PubMed:\s*', '', a.text)
else:
pmnum = \
re.search(
r'\/pubmed\/(\d+)$', a['href']).group(1)
if pmnum is not None:
pmid = 'PMID:'+str(pmnum)
self.gu.addTriple(
self.graph, pmid,
self.gu.object_properties['is_about'],
nbk_id)
pmid_set.add(pmnum)
r = Reference(
pmid, Reference.ref_types['journal_article'])
r.addRefToGraph(self.graph)
# TODO add author history, copyright, license to dataset
# TODO get PMID-NBKID equivalence (near foot of page),
# and make it "is about" link
# self.gu.addTriple(
# self.graph, pmid,
# self.gu.object_properties['is_about'], nbk_id)
# for example: NBK1191 PMID:20301370
# add the book to the dataset
self.dataset.setFileAccessUrl(book_item['url'])
if limit is not None and c > limit:
break
# finish looping through books
l = len(books_not_found)
if len(books_not_found) > 0:
if l > 100:
logger.warning("There were %d books not found.", l)
else:
logger.warning(
"The following %d books were not found locally: %s",
l, str(books_not_found))
logger.info(
"Finished processing %d books for clinical descriptions", c-l)
return
def getTestSuite(self):
import unittest
from tests.test_genereviews import GeneReviewsTestCase
test_suite = \
unittest.TestLoader().loadTestsFromTestCase(GeneReviewsTestCase)
return test_suite
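The PubMed handling in Example 2 pulls ids either from the anchor text or from the href. Here is a self-contained sketch of that same parsing pattern; the helper name, the guard against non-matching hrefs, and the sample HTML are illustrative additions, not part of the dipper source.

# Standalone sketch of the PubMed-ID extraction pattern used in Example 2.
import re
from bs4 import BeautifulSoup

def extract_pmids(html):
    """Return the set of PubMed ids linked from 'pubmed' anchors."""
    soup = BeautifulSoup(html, 'html.parser')
    pmids = set()
    for a in soup.find_all('a', attrs={'href': re.compile(r'pubmed')}):
        if re.match(r'PubMed:', a.text):
            # id is spelled out in the link text, e.g. "PubMed: 20301370"
            pmids.add(re.sub(r'PubMed:\s*', '', a.text).strip())
        else:
            match = re.search(r'/pubmed/(\d+)$', a['href'])
            if match is not None:  # guard against unexpected href formats
                pmids.add(match.group(1))
    return pmids

print(extract_pmids(
    '<div class="bk_ref">'
    '<a href="https://www.ncbi.nlm.nih.gov/pubmed/20301370">PubMed: 20301370</a>'
    '</div>'))  # -> {'20301370'}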
Example 3: Coriell
# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or: from dipper.models.Dataset.Dataset import setFileAccessUrl [as alias]
#.........part of the code omitted here.........
# we rename (for simplicity) the original file
st = None
if os.path.exists(target_name):
st = os.stat(target_name)
logger.info(
"Local file date: %s",
datetime.utcfromtimestamp(st[stat.ST_CTIME]))
if st is None or remotef.st_mtime > st[stat.ST_CTIME]:
if st is None:
logger.info(
"File does not exist locally; downloading...")
else:
logger.info(
"There's a new version of %s catalog available; "
"downloading...", r)
sftp.get(remotef.filename, target_name)
logger.info(
"Fetched remote %s -> %s",
remotef.filename, target_name)
st = os.stat(target_name)
filedate = \
datetime.utcfromtimestamp(
remotef.st_mtime).strftime("%Y-%m-%d")
logger.info(
"New file date: %s",
datetime.utcfromtimestamp(st[stat.ST_CTIME]))
else:
logger.info("File %s exists; using local copy", fname)
filedate = \
datetime.utcfromtimestamp(
st[stat.ST_CTIME]).strftime("%Y-%m-%d")
self.dataset.setFileAccessUrl(remotef.filename)
self.dataset.setVersion(filedate)
return
def parse(self, limit=None):
if limit is not None:
logger.info("Only parsing first %s rows of each file", limit)
logger.info("Parsing files...")
if self.testOnly:
self.testMode = True
for f in self.files:
file = '/'.join((self.rawdir, self.files[f]['file']))
self._process_collection(
self.files[f]['id'],
self.files[f]['label'],
self.files[f]['page'])
self._process_data(file, limit)
logger.info("Finished parsing.")
self.load_bindings()
logger.info("Found %d nodes in graph", len(self.graph))
logger.info("Found %d nodes in testgraph", len(self.testgraph))
return
def _process_data(self, raw, limit=None):
"""
This function will process the data files from Coriell.
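Example 3 is truncated at this point. For reference, here is a compact sketch of the "download only if the remote copy is newer" check it performs over SFTP. The function name is made up, and sftp is assumed to be an already-connected paramiko.SFTPClient; this is a simplification of the snippet above, not the dipper implementation.

# Rough sketch of the freshness check from Example 3 (assumptions noted above).
import os
import stat
from datetime import datetime

def fetch_if_newer(sftp, remote_name, local_path):
    remote_attrs = sftp.stat(remote_name)  # SFTPAttributes, exposes st_mtime
    if os.path.exists(local_path):
        local_ctime = os.stat(local_path)[stat.ST_CTIME]
        if remote_attrs.st_mtime <= local_ctime:
            # local copy is current; report its date as the version
            return datetime.utcfromtimestamp(local_ctime).strftime("%Y-%m-%d")
    sftp.get(remote_name, local_path)  # fetch the newer remote file
    return datetime.utcfromtimestamp(remote_attrs.st_mtime).strftime("%Y-%m-%d")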
Example 4: fetch
# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or: from dipper.models.Dataset.Dataset import setFileAccessUrl [as alias]
#.........part of the code omitted here.........
LOG.warning("New Remote File exists but it is SMALLER")
return True
# filesize is a fairly imperfect metric here
LOG.info("New Remote File has same filesize--will not download")
elif fstat[ST_SIZE] != size:
LOG.info(
"Remote File is %i \t Local File is %i", size, fstat[ST_SIZE])
return True
return False
def get_files(self, is_dl_forced, files=None):
"""
Given a set of files for this source, it will go fetch them, and
set a default version by date. If you need to set the version number
by another method, then it can be set again.
:param is_dl_forced - boolean
:param files dict - override instance files dict
:return: None
"""
fstat = None
if files is None:
files = self.files
for fname in files:
headers = None
filesource = files[fname]
if 'headers' in filesource:
headers = filesource['headers']
LOG.info("Getting %s", fname)
# if the key 'clean' exists in the sources `files` dict
# expose that instead of the longer url
if 'clean' in filesource and filesource['clean'] is not None:
self.dataset.setFileAccessUrl(filesource['clean'])
else:
self.dataset.setFileAccessUrl(filesource['url'])
LOG.info('Fetching %s', filesource['url'])
self.fetch_from_url(
filesource['url'], '/'.join((self.rawdir, filesource['file'])),
is_dl_forced, headers)
fstat = os.stat('/'.join((self.rawdir, filesource['file'])))
# only keeping the date from the last file
filedate = datetime.utcfromtimestamp(fstat[ST_CTIME]).strftime("%Y-%m-%d")
# FIXME
# change this so the date is attached only to each file, not the entire dataset
self.dataset.set_date_issued(filedate)
def fetch_from_url(
self, remotefile, localfile=None, is_dl_forced=False, headers=None):
"""
Given a remote url and a local filename, attempt to determine
if the remote file is newer; if it is,
fetch the remote file and save it to the specified localfile,
reporting the basic file information once it is downloaded
:param remotefile: URL of remote file to fetch
:param localfile: pathname of file to save locally
:return: None
"""
response = None
if ((is_dl_forced is True) or localfile is None or