This article collects typical usage examples of the Python method dipper.models.Dataset.Dataset.setFileAccessUrl. If you are wondering what Dataset.setFileAccessUrl does, how to call it, or what real uses of it look like, the curated method examples below should help. You can also explore further usage examples of the containing class, dipper.models.Dataset.Dataset.
Four code examples of the Dataset.setFileAccessUrl method are shown below, sorted by popularity by default.
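Before the examples, a minimal orientation sketch of how the method tends to be used: a source builds a Dataset object describing itself, then records each location it actually fetched raw data from via setFileAccessUrl. The source name, URLs, and argument comments below are placeholders, not taken from dipper itself; the argument order simply mirrors the Dataset('eom', ...) call in Example 1.

from dipper.models.Dataset import Dataset

# Hypothetical illustration only; argument order mirrors the
# Dataset('eom', ...) constructor call shown in Example 1 below.
dataset = Dataset(
    'mysource', 'My Source', 'http://example.org/mysource', None,
    'http://example.org/data-rights',
    'https://creativecommons.org/publicdomain/mark/1.0/')

# Record where the raw data was actually retrieved from,
# so the provenance ends up attached to the dataset.
dataset.setFileAccessUrl('http://example.org/downloads/mysource_data.tsv')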
Example 1: EOM
# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or: from dipper.models.Dataset.Dataset import setFileAccessUrl [as alias]
class EOM(PostgreSQLSource):
"""
Elements of Morphology is a resource from NHGRI that has definitions of
morphological abnormalities, together with image depictions.
We pull those relationships, as well as our local mapping of equivalences
between EOM and HP terminologies.
The website is crawled monthly by NIF's DISCO crawler system,
which we utilize here.
Be sure to have pg user/password connection details in your conf.json file,
like:
dbauth : {
'disco' : {'user' : '<username>', 'password' : '<password>'}
}
Monarch-curated data for the HP to EOM mapping is stored at
https://phenotype-ontologies.googlecode.com
Since this resource is so small, the entirety of it is the "test" set.
"""
# we are using the production view here; should we be using services?
tables = [
'dvp.pr_nlx_157874_1'
]
files = {
'map': {
'file': 'hp-to-eom-mapping.tsv',
'url': 'https://phenotype-ontologies.googlecode.com/svn/trunk/src/ontology/hp/mappings/hp-to-eom-mapping.tsv'
}
}
def __init__(self):
super().__init__('eom')
self.namespaces.update(curie_map.get())
# update the dataset object with details about this resource
# TODO put this into a conf file?
self.dataset = Dataset(
'eom', 'EOM', 'http://elementsofmorphology.nih.gov', None,
'http://www.genome.gov/copyright.cfm',
'https://creativecommons.org/publicdomain/mark/1.0/')
# check if config exists; if it doesn't, error out and let user know
if 'dbauth' not in config.get_config() or \
'disco' not in config.get_config()['dbauth']:
logger.error("not configured with PG user/password.")
# source-specific warnings. will be cleared when resolved.
return
def fetch(self, is_dl_forced=False):
'''create the connection details for DISCO'''
cxn = config.get_config()['dbauth']['disco']
cxn.update(
{'host': 'nif-db.crbs.ucsd.edu', 'database': 'disco_crawler',
'port': 5432})
self.dataset.setFileAccessUrl(
''.join(('jdbc:postgresql://', cxn['host'], ':', str(cxn['port']),
'/', cxn['database'])))
# process the tables
# self.fetch_from_pgdb(self.tables,cxn,100) #for testing
self.fetch_from_pgdb(self.tables, cxn)
self.get_files(is_dl_forced)
# FIXME: Everything needed for data provenance?
st = os.stat('/'.join((self.rawdir, 'dvp.pr_nlx_157874_1')))
filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
self.dataset.setVersion(filedate)
return
def parse(self, limit=None):
'''
Override Source.parse inherited via PostgreSQLSource
'''
if limit is not None:
logger.info("Only parsing first %s rows of each file", limit)
if self.testOnly:
self.testMode = True
logger.info("Parsing files...")
self._process_nlx_157874_1_view('/'.join((self.rawdir,
'dvp.pr_nlx_157874_1')),
limit)
self._map_eom_terms('/'.join((self.rawdir, self.files['map']['file'])),
limit)
logger.info("Finished parsing.")
# ......... part of the code is omitted here .........
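Note that in Example 1 the value passed to setFileAccessUrl is not an HTTP link but a JDBC-style connection string assembled from the DISCO connection details; a small sketch of what that join in EOM.fetch produces (the conf.json credentials are deliberately not embedded in the URL):

# Equivalent to the ''.join(...) call in EOM.fetch above
cxn = {'host': 'nif-db.crbs.ucsd.edu', 'database': 'disco_crawler', 'port': 5432}
jdbc_url = 'jdbc:postgresql://{host}:{port}/{database}'.format(**cxn)
# -> 'jdbc:postgresql://nif-db.crbs.ucsd.edu:5432/disco_crawler'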
Example 2: GeneReviews
# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or: from dipper.models.Dataset.Dataset import setFileAccessUrl [as alias]
# ......... part of the code is omitted here .........
# figure out if the book is there; if so, process, otherwise skip
book_dir = '/'.join((self.rawdir, 'books'))
book_files = os.listdir(book_dir)
if ''.join((nbk, '.html')) not in book_files:
# logger.warning("No book found locally for %s; skipping", nbk)
books_not_found.add(nbk)
continue
logger.info("Processing %s", nbk)
page = open(url)
soup = BeautifulSoup(page.read())
# sec0 == clinical description
clin_summary = \
soup.find(
'div', id=re.compile(".*Summary.sec0"))
if clin_summary is not None:
p = clin_summary.find('p')
ptext = p.text
ptext = re.sub(r'\s+', ' ', ptext)
ul = clin_summary.find('ul')
if ul is not None:
item_text = list()
for li in ul.find_all('li'):
item_text.append(re.sub(r'\s+', ' ', li.text))
ptext += ' '.join(item_text)
# add in the copyright and citation info to description
ptext = \
' '.join(
(ptext,
'[GeneReviews:NBK1116, GeneReviews:NBK138602, ' +
nbk_id+']'))
self.gu.addDefinition(self.graph, nbk_id, ptext.strip())
# get the pubs
pmid_set = set()
pub_div = soup.find('div', id=re.compile(r".*Literature_Cited"))
if pub_div is not None:
ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
for r in ref_list:
for a in r.find_all(
'a', attrs={'href': re.compile(r"pubmed")}):
if re.match(r'PubMed:', a.text):
pmnum = re.sub(r'PubMed:\s*', '', a.text)
else:
pmnum = \
re.search(
r'\/pubmed\/(\d+)$', a['href']).group(1)
if pmnum is not None:
pmid = 'PMID:'+str(pmnum)
self.gu.addTriple(
self.graph, pmid,
self.gu.object_properties['is_about'],
nbk_id)
pmid_set.add(pmnum)
r = Reference(
pmid, Reference.ref_types['journal_article'])
r.addRefToGraph(self.graph)
# TODO add author history, copyright, license to dataset
# TODO get PMID-NBKID equivalence (near foot of page),
# and make it "is about" link
# self.gu.addTriple(
# self.graph, pmid,
# self.gu.object_properties['is_about'], nbk_id)
# for example: NBK1191 PMID:20301370
# add the book to the dataset
self.dataset.setFileAccessUrl(book_item['url'])
if limit is not None and c > limit:
break
# finish looping through books
l = len(books_not_found)
if len(books_not_found) > 0:
if l > 100:
logger.warning("There were %d books not found.", l)
else:
logger.warning(
"The following %d books were not found locally: %s",
l, str(books_not_found))
logger.info(
"Finished processing %d books for clinical descriptions", c-l)
return
def getTestSuite(self):
import unittest
from tests.test_genereviews import GeneReviewsTestCase
test_suite = \
unittest.TestLoader().loadTestsFromTestCase(GeneReviewsTestCase)
return test_suite
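The most intricate part of Example 2 is pulling PubMed IDs out of the reference anchors: the visible "PubMed:" label is preferred, otherwise the trailing number in the pubmed href is used. A self-contained sketch of just that branch, with extract_pmid as a hypothetical helper name (not part of dipper):

import re

def extract_pmid(anchor_text, href):
    # Prefer the visible "PubMed: N" label; otherwise fall back to the
    # trailing ID in a .../pubmed/<N> href, as in the loop above.
    if re.match(r'PubMed:', anchor_text):
        return re.sub(r'PubMed:\s*', '', anchor_text).strip()
    match = re.search(r'/pubmed/(\d+)$', href)
    return match.group(1) if match else None

# extract_pmid('PubMed: 20301370', '')                                     -> '20301370'
# extract_pmid('20301370', 'https://www.ncbi.nlm.nih.gov/pubmed/20301370') -> '20301370'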
Example 3: Coriell
# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or: from dipper.models.Dataset.Dataset import setFileAccessUrl [as alias]
# ......... part of the code is omitted here .........
# we rename (for simplicity) the original file
st = None
if os.path.exists(target_name):
st = os.stat(target_name)
logger.info(
"Local file date: %s",
datetime.utcfromtimestamp(st[stat.ST_CTIME]))
if st is None or remotef.st_mtime > st[stat.ST_CTIME]:
if st is None:
logger.info(
"File does not exist locally; downloading...")
else:
logger.info(
"There's a new version of %s catalog available; "
"downloading...", r)
sftp.get(remotef.filename, target_name)
logger.info(
"Fetched remote %s -> %s",
remotef.filename, target_name)
st = os.stat(target_name)
filedate = \
datetime.utcfromtimestamp(
remotef.st_mtime).strftime("%Y-%m-%d")
logger.info(
"New file date: %s",
datetime.utcfromtimestamp(st[stat.ST_CTIME]))
else:
logger.info("File %s exists; using local copy", fname)
filedate = \
datetime.utcfromtimestamp(
st[stat.ST_CTIME]).strftime("%Y-%m-%d")
self.dataset.setFileAccessUrl(remotef.filename)
self.dataset.setVersion(filedate)
return
def parse(self, limit=None):
if limit is not None:
logger.info("Only parsing first %s rows of each file", limit)
logger.info("Parsing files...")
if self.testOnly:
self.testMode = True
for f in self.files:
file = '/'.join((self.rawdir, self.files[f]['file']))
self._process_collection(
self.files[f]['id'],
self.files[f]['label'],
self.files[f]['page'])
self._process_data(file, limit)
logger.info("Finished parsing.")
self.load_bindings()
logger.info("Found %d nodes in graph", len(self.graph))
logger.info("Found %d nodes in testgraph", len(self.testgraph))
return
def _process_data(self, raw, limit=None):
"""
This function will process the data files from Coriell.
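The heart of the Coriell fetch above is the freshness check: download only when there is no local copy, or when the remote file's mtime is newer than the local file's change time. A minimal sketch of that decision, with needs_download as a hypothetical helper name (not part of dipper):

import os
import stat

def needs_download(target_name, remote_mtime):
    # No local copy yet, or the remote copy is newer than what we have
    if not os.path.exists(target_name):
        return True
    return remote_mtime > os.stat(target_name)[stat.ST_CTIME]

# The version date recorded via Dataset.setVersion() is then just
# datetime.utcfromtimestamp(remote_mtime).strftime("%Y-%m-%d")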
Example 4: fetch
# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or: from dipper.models.Dataset.Dataset import setFileAccessUrl [as alias]
# ......... part of the code is omitted here .........
LOG.warning("New Remote File exists but it is SMALLER")
return True
# filesize is a fairly imperfect metric here
LOG.info("New Remote fFle has same filesize--will not download")
elif fstat[ST_SIZE] != size:
LOG.info(
"Remote File is %i \t Local File is %i", size, fstat[ST_SIZE])
return True
return False
def get_files(self, is_dl_forced, files=None):
"""
Given a set of files for this source, it will go fetch them, and
set a default version by date. If you need to set the version number
by another method, then it can be set again.
:param is_dl_forced - boolean
:param files dict - override instance files dict
:return: None
"""
fstat = None
if files is None:
files = self.files
for fname in files:
headers = None
filesource = files[fname]
if 'headers' in filesource:
headers = filesource['headers']
LOG.info("Getting %s", fname)
# if the key 'clean' exists in the sources `files` dict
# expose that instead of the longer url
if 'clean' in filesource and filesource['clean'] is not None:
self.dataset.setFileAccessUrl(filesource['clean'])
else:
self.dataset.setFileAccessUrl(filesource['url'])
LOG.info('Fetching %s', filesource['url'])
self.fetch_from_url(
filesource['url'], '/'.join((self.rawdir, filesource['file'])),
is_dl_forced, headers)
fstat = os.stat('/'.join((self.rawdir, filesource['file'])))
# only keeping the date from the last file
filedate = datetime.utcfromtimestamp(fstat[ST_CTIME]).strftime("%Y-%m-%d")
# FIXME
# change this so the date is attached only to each file, not the entire dataset
self.dataset.set_date_issued(filedate)
def fetch_from_url(
self, remotefile, localfile=None, is_dl_forced=False, headers=None):
"""
Given a remote url and a local filename, attempt to determine
if the remote file is newer; if it is,
fetch the remote file and save it to the specified localfile,
reporting the basic file information once it is downloaded
:param remotefile: URL of remote file to fetch
:param localfile: pathname of file to save locally
:return: None
"""
response = None
if ((is_dl_forced is True) or localfile is None or