This article collects and summarizes typical usage examples of the Python method dipper.models.Dataset.Dataset.setVersion. If you are wondering what Dataset.setVersion does, how to use it, and what real-world calls look like, the curated examples below should help. You can also read further into the containing class, dipper.models.Dataset.Dataset, for more usage examples.
Eight code examples of Dataset.setVersion are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
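Before diving in, here is the pattern all eight examples share: a source constructs a Dataset describing its resource, then stamps it with a version via setVersion, passing either a single file-date string or a file date plus a resource-supplied release string. The sketch below is illustrative only; the constructor arguments (identifier, title, homepage, citation placeholder, license URL) are inferred from the examples, and 'mysource' and its URLs are hypothetical.

from dipper.models.Dataset import Dataset

# Constructor arguments inferred from the examples below;
# 'mysource' and the URLs are placeholders, not a real dipper source.
dataset = Dataset(
    'mysource', 'My Source', 'http://example.org', None,
    'http://example.org/terms')

# One-argument form: the version is a single date string
dataset.setVersion('2015-04-22')

# Two-argument form used by several sources: a file date plus a
# resource-supplied version (e.g. BioGrid's '3.2.119')
dataset.setVersion('2015-04-22', '3.2.119')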
Example 1: EOM
# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or alternatively: from dipper.models.Dataset.Dataset import setVersion [as alias]
class EOM(PostgreSQLSource):
    """
    Elements of Morphology is a resource from NHGRI that has definitions of
    morphological abnormalities, together with image depictions.
    We pull those relationships, as well as our local mapping of equivalences
    between EOM and HP terminologies.
    The website is crawled monthly by NIF's DISCO crawler system,
    which we utilize here.
    Be sure to have pg user/password connection details in your conf.json
    file, like:
    dbauth : {
        'disco' : {'user' : '<username>', 'password' : '<password>'}
    }
    Monarch-curated data for the HP to EOM mapping is stored at
    https://phenotype-ontologies.googlecode.com
    Since this resource is so small, the entirety of it is the "test" set.
    """

    # we are using the production view here; should we be using services?
    tables = [
        'dvp.pr_nlx_157874_1'
    ]

    files = {
        'map': {
            'file': 'hp-to-eom-mapping.tsv',
            'url': 'https://phenotype-ontologies.googlecode.com/svn/trunk/src/ontology/hp/mappings/hp-to-eom-mapping.tsv'
        }
    }

    def __init__(self):
        super().__init__('eom')
        self.namespaces.update(curie_map.get())

        # update the dataset object with details about this resource
        # TODO put this into a conf file?
        self.dataset = Dataset(
            'eom', 'EOM', 'http://elementsofmorphology.nih.gov', None,
            'http://www.genome.gov/copyright.cfm',
            'https://creativecommons.org/publicdomain/mark/1.0/')

        # check if config exists; if it doesn't, error out and let user know
        if 'dbauth' not in config.get_config() or \
                'disco' not in config.get_config()['dbauth']:
            logger.error("not configured with PG user/password.")

        # source-specific warnings. will be cleared when resolved.
        return

    def fetch(self, is_dl_forced=False):
        '''create the connection details for DISCO'''

        cxn = config.get_config()['dbauth']['disco']
        cxn.update(
            {'host': 'nif-db.crbs.ucsd.edu', 'database': 'disco_crawler',
             'port': 5432})

        self.dataset.setFileAccessUrl(
            ''.join(('jdbc:postgresql://', cxn['host'], ':', str(cxn['port']),
                     '/', cxn['database'])))

        # process the tables
        # self.fetch_from_pgdb(self.tables, cxn, 100)  # for testing
        self.fetch_from_pgdb(self.tables, cxn)

        self.get_files(is_dl_forced)

        # FIXME: Everything needed for data provenance?
        st = os.stat('/'.join((self.rawdir, 'dvp.pr_nlx_157874_1')))
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
        self.dataset.setVersion(filedate)

        return

    def parse(self, limit=None):
        '''
        Override Source.parse inherited via PostgreSQLSource
        '''

        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        if self.testOnly:
            self.testMode = True

        logger.info("Parsing files...")

        self._process_nlx_157874_1_view('/'.join((self.rawdir,
                                                  'dvp.pr_nlx_157874_1')),
                                        limit)
        self._map_eom_terms('/'.join((self.rawdir, self.files['map']['file'])),
                            limit)

        logger.info("Finished parsing.")

#.........(part of the code omitted).........
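A note on the versioning idiom in fetch() above, which recurs in every example on this page: the version string is the local file's change date. A self-contained sketch of just that idiom follows; the imports are from the standard library and are not shown in the excerpts, and the helper name is illustrative. Keep in mind that on Unix, ST_CTIME is the inode-change time rather than a true creation time.

import os
from datetime import datetime
from stat import ST_CTIME

def file_version(path):
    # Date-stamp a file in UTC, e.g. '2015-04-22', as the examples do
    st = os.stat(path)
    return datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")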
Example 2: CTD
# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or alternatively: from dipper.models.Dataset.Dataset import setVersion [as alias]
#.........(part of the code omitted).........
            limit, self.files['chemical_disease_interactions']['file'])
        self._parse_ctd_file(limit, self.files['gene_pathway']['file'])
        self._parse_ctd_file(limit, self.files['gene_disease']['file'])
        self._parse_curated_chem_disease(limit)
        logger.info("Done parsing files.")
        return

    def _parse_ctd_file(self, limit, file):
        """
        Parses files in CTD.files dictionary
        Args:
            :param limit (int): limit the number of rows processed
            :param file (str): file name (must be defined in CTD.files)
        Returns:
            :return None
        """
        row_count = 0
        version_pattern = re.compile(r'^# Report created: (.+)$')
        is_versioned = False
        file_path = '/'.join((self.rawdir, file))
        with gzip.open(file_path, 'rt') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            for row in reader:
                # Scan the header lines until we get the version.
                # There is no official version, so we are using
                # the upload timestamp instead
                if is_versioned is False:
                    match = re.match(version_pattern, ' '.join(row))
                    if match:
                        version = re.sub(r'\s|:', '-', match.group(1))
                        # TODO convert this timestamp to a proper timestamp
                        self.dataset.setVersion(version)
                        is_versioned = True
                elif re.match(r'^#', ' '.join(row)):
                    pass
                else:
                    row_count += 1
                    if file == self.files[
                            'chemical_disease_interactions']['file']:
                        self._process_interactions(row)
                    elif file == self.files['gene_pathway']['file']:
                        self._process_pathway(row)
                    elif file == self.files['gene_disease']['file']:
                        self._process_disease2gene(row)
                if not self.testMode and \
                        limit is not None and row_count >= limit:
                    break
        return

    def _process_pathway(self, row):
        """
        Process row of CTD data from CTD_genes_pathways.tsv.gz
        and generate triples
        Args:
            :param row (list): row of CTD data
        Returns:
            :return None
        """
        model = Model(self.g)
        self._check_list_len(row, 4)
        (gene_symbol, gene_id, pathway_name, pathway_id) = row
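The interesting part of _parse_ctd_file() is how it manufactures a version: CTD ships no official release number, so the '# Report created: ...' header line is normalized into a dash-separated token. A worked example of just that transformation (the header text is made up):

import re

header = '# Report created: Mon Apr 20 12:30:01 2015'  # hypothetical header line
match = re.match(r'^# Report created: (.+)$', header)
if match:
    version = re.sub(r'\s|:', '-', match.group(1))
    print(version)  # Mon-Apr-20-12-30-01-2015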
Example 3: Coriell
# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or alternatively: from dipper.models.Dataset.Dataset import setVersion [as alias]
#.........(part of the code omitted).........
                st = None
                if os.path.exists(target_name):
                    st = os.stat(target_name)
                    logger.info(
                        "Local file date: %s",
                        datetime.utcfromtimestamp(st[stat.ST_CTIME]))
                if st is None or remotef.st_mtime > st[stat.ST_CTIME]:
                    if st is None:
                        logger.info(
                            "File does not exist locally; downloading...")
                    else:
                        logger.info(
                            "There's a new version of %s catalog available; "
                            "downloading...", r)
                    sftp.get(remotef.filename, target_name)
                    logger.info(
                        "Fetched remote %s -> %s",
                        remotef.filename, target_name)
                    st = os.stat(target_name)
                    filedate = \
                        datetime.utcfromtimestamp(
                            remotef.st_mtime).strftime("%Y-%m-%d")
                    logger.info(
                        "New file date: %s",
                        datetime.utcfromtimestamp(st[stat.ST_CTIME]))
                else:
                    logger.info("File %s exists; using local copy", fname)
                    filedate = \
                        datetime.utcfromtimestamp(
                            st[stat.ST_CTIME]).strftime("%Y-%m-%d")
                self.dataset.setFileAccessUrl(remotef.filename)
                self.dataset.setVersion(filedate)
        return

    def parse(self, limit=None):
        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)
        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        for f in self.files:
            file = '/'.join((self.rawdir, self.files[f]['file']))
            self._process_collection(
                self.files[f]['id'],
                self.files[f]['label'],
                self.files[f]['page'])
            self._process_data(file, limit)

        logger.info("Finished parsing.")

        self.load_bindings()

        logger.info("Found %d nodes in graph", len(self.graph))
        logger.info("Found %d nodes in testgraph", len(self.testgraph))

        return

    def _process_data(self, raw, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
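Although the excerpt is cut off above, its fetch() logic reduces to a freshness check: download only when no local copy exists or the remote modification time is newer than the local change time. A condensed sketch, assuming (as the attribute names suggest) a paramiko-style SFTP client where remotef comes from sftp.listdir_attr(); both objects are hypothetical here:

import os
import stat
from datetime import datetime

def fetch_if_newer(sftp, remotef, target_name):
    # Download only when missing locally or the remote copy is newer;
    # returns the date string used for dataset versioning.
    st = os.stat(target_name) if os.path.exists(target_name) else None
    if st is None or remotef.st_mtime > st[stat.ST_CTIME]:
        sftp.get(remotef.filename, target_name)
        return datetime.utcfromtimestamp(remotef.st_mtime).strftime("%Y-%m-%d")
    return datetime.utcfromtimestamp(st[stat.ST_CTIME]).strftime("%Y-%m-%d")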
Example 4: MMRRC
# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or alternatively: from dipper.models.Dataset.Dataset import setVersion [as alias]
class MMRRC(Source):
    """
    Here we process the Mutant Mouse Resource and Research Center
    (https://www.mmrrc.org) strain data,
    which includes:
    * strains, their mutant alleles
    * phenotypes of the alleles
    * descriptions of the research uses of the strains

    Note that some gene identifiers are not included
    (for many of the transgenics with human genes) in the raw data.
    We do our best to process the links between the variant and
    the affected gene, but sometimes the mapping is not clear,
    and we do not include it.
    Many of these details will be solved by merging this source with
    the MGI data source, which has the variant-to-gene designations.

    Also note that even though the strain pages at the MMRRC site do list
    phenotypic differences in the context of the strain backgrounds,
    they do not provide that data to us,
    and thus we cannot supply that disambiguation here.
    """

    files = {
        'catalog': {
            'file': 'mmrrc_catalog_data.csv',
            'url': 'https://www.mmrrc.org/about/mmrrc_catalog_data.csv'},
    }

    test_ids = [
        'MMRRC:037507-MU', 'MMRRC:041175-UCD', 'MMRRC:036933-UNC',
        'MMRRC:037884-UCD', 'MMRRC:000255-MU', 'MMRRC:037372-UCD',
        'MMRRC:000001-UNC'
    ]

    def __init__(self):
        Source.__init__(self, 'mmrrc')
        self.strain_hash = {}
        self.id_label_hash = {}
        self.load_bindings()
        self.dataset = Dataset(
            'mmrrc', 'Mutant Mouse Regional Resource Centers',
            'https://www.mmrrc.org', None,
            'https://www.mmrrc.org/about/data_download.php')
        return

    def fetch(self, is_dl_forced=False):
        self.get_files(is_dl_forced)
        fname = '/'.join((self.rawdir, self.files['catalog']['file']))
        st = os.stat(fname)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
        # TODO note: can set the data version to what is in the header
        # first line, like:
        # This MMRRC catalog data file was generated on 2015-04-22
        self.dataset.setVersion(filedate)
        return

    def parse(self, limit=None):
        if limit is not None:
            logger.info("Only parsing first %s rows", limit)
        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        self._process_phenotype_data(limit)

        logger.info("Finished parsing.")
        return

    def _process_phenotype_data(self, limit):
        """
        NOTE: If a Strain carries more than one mutation,
        then each Mutation description,
        i.e., the set: (
            Mutation Type - Chromosome - Gene Symbol -
            Gene Name - Allele Symbol - Allele Name)
        will require a separate line.

        Note that MMRRC curates phenotypes to alleles,
        even though they distribute only one file with the
        phenotypes appearing to be associated with a strain.

        So, here we process the allele-to-phenotype relationships separately
        from the strain-to-allele relationships.

        :param limit:
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
#.........(part of the code omitted).........
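The TODO in fetch() points out that the catalog's first line carries its own generation date ('This MMRRC catalog data file was generated on 2015-04-22'). Purely as a sketch of what that TODO might look like, assuming the header wording in the comment is stable; the helper name is hypothetical:

import re

def version_from_header(fname):
    # Pull the date out of a first line like:
    # "This MMRRC catalog data file was generated on 2015-04-22"
    with open(fname, 'r') as f:
        first = f.readline()
    match = re.search(r'generated on (\d{4}-\d{2}-\d{2})', first)
    return match.group(1) if match else None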
Example 5: HPOAnnotations
# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or alternatively: from dipper.models.Dataset.Dataset import setVersion [as alias]
class HPOAnnotations(Source):
    """
    The [Human Phenotype Ontology](http://human-phenotype-ontology.org) group
    curates and assembles over 115,000 annotations to hereditary diseases
    using the HPO ontology. Here we create OBAN-style associations between
    diseases and phenotypic features, together with their evidence,
    and age of onset and frequency (if known).
    The parser currently only processes the "abnormal" annotations.
    Association to "remarkable normality" will be added in the near future.
    In order to properly test this class, you should have a conf.json file
    configured with some test ids, in the structure of:
    <pre>
    test_ids: {
        "disease": ["OMIM:119600", "OMIM:120160"]  # as examples. put your favorite ids in the config.
    }
    </pre>
    """

    files = {
        'annot': {
            'file': 'phenotype_annotation.tab',
            'url': 'http://compbio.charite.de/hudson/job/hpo.annotations/lastStableBuild/artifact/misc/phenotype_annotation.tab'},
        'version': {
            'file': 'data_version.txt',
            'url': 'http://compbio.charite.de/hudson/job/hpo.annotations/lastStableBuild/artifact/misc/data_version.txt'},
        # 'neg_annot': {
        #     'file': 'phenotype_annotation.tab',
        #     'url': 'http://compbio.charite.de/hudson/job/hpo.annotations/lastStableBuild/artifact/misc/negative_phenotype_annotation.tab'},
    }

    # note, two of these codes are awaiting term requests. see #114 and
    # https://code.google.com/p/evidenceontology/issues/detail?id=32
    eco_dict = {
        "ICE": "ECO:0000305",  # FIXME currently using "curator inference used in manual assertion"
        "IEA": "ECO:0000501",  # Inferred from Electronic Annotation
        "PCS": "ECO:0000269",  # FIXME currently using "experimental evidence used in manual assertion"
        "TAS": "ECO:0000304"   # Traceable Author Statement
    }

    def __init__(self):
        Source.__init__(self, 'hpoa')
        self.load_bindings()
        self.dataset = Dataset(
            'hpoa', 'Human Phenotype Ontology',
            'http://www.human-phenotype-ontology.org', None,
            'http://www.human-phenotype-ontology.org/contao/index.php/legal-issues.html')

        if 'test_ids' not in config.get_config() or \
                'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
        else:
            self.test_ids = config.get_config()['test_ids']['disease']

        # data-source specific warnings (will be removed when issues are cleared)
        logger.warning(
            "note that some ECO classes are missing for ICE and PCS; "
            "using temporary mappings.")
        return

    def fetch(self, is_dl_forced=False):
        self.get_files(is_dl_forced)
        self.scrub()

        # get the latest build from jenkins
        # NOT DOING THIS ANY MORE - but leaving it in for reference
        # jenkins_info = eval(urllib.request.urlopen('http://compbio.charite.de/hudson/job/hpo.annotations/lastSuccessfulBuild/api/python').read())
        # version = jenkins_info['number']

        # use the files['version'] file as the version
        fname = '/'.join((self.rawdir, self.files['version']['file']))
        with open(fname, 'r', encoding="utf8") as f:
            # 2015-04-23 13:01
            v = f.readline()  # read the first line (the only line, really)
            d = datetime.strptime(
                v.strip(), '%Y-%m-%d %H:%M').strftime("%Y-%m-%d-%H-%M")
            f.close()

        st = os.stat(fname)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")

        # this will cause two dates to be attached to the dataset
        # (one from the filedate, and the other from here)
        # TODO when #112 is implemented, this will result in
        # only the whole dataset being versioned
        self.dataset.setVersion(filedate, d)
        return

    def scrub(self):
        """
        Perform various data-scrubbing on the raw data files prior to parsing.
        For this resource, this currently includes:
        * revise errors in identifiers for some OMIM and PMIDs
        :return: None
        """
        # scrub file of the oddities...lots of publication rewriting
        f = '/'.join((self.rawdir, self.files['annot']['file']))
        logger.info('scrubbing PubMed:12345 --> PMID:12345')
        pysed.replace("PubMed", 'PMID', f)
        logger.info('scrubbing pmid:12345 --> PMID:12345')
        pysed.replace("pmid", 'PMID', f)
#.........(part of the code omitted).........
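fetch() above attaches two stamps via the two-argument setVersion call: the file date and the build timestamp read from data_version.txt. A worked example of the timestamp conversion, using the sample value from the comment in the code:

from datetime import datetime

v = '2015-04-23 13:01'  # first line of data_version.txt, per the comment above
d = datetime.strptime(v.strip(), '%Y-%m-%d %H:%M').strftime("%Y-%m-%d-%H-%M")
print(d)  # 2015-04-23-13-01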
Example 6: BioGrid
# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or alternatively: from dipper.models.Dataset.Dataset import setVersion [as alias]
class BioGrid(Source):
    """
    Biogrid interaction data
    """
    # TODO write up class summary for docstring

    files = {
        'interactions': {
            'file': 'interactions.mitab.zip',
            'url': BGDL + '/BIOGRID-ALL-LATEST.mitab.zip'},
        'identifiers': {
            'file': 'identifiers.tab.zip',
            'url': BGDL + '/BIOGRID-IDENTIFIERS-LATEST.tab.zip'}
    }

    # biogrid-specific identifiers for use in subsetting identifier mapping
    biogrid_ids = [
        106638, 107308, 107506, 107674, 107675, 108277, 108506, 108767, 108814,
        108899, 110308, 110364, 110678, 111642, 112300, 112365, 112771, 112898,
        199832, 203220, 247276, 120150, 120160, 124085]

    def __init__(self, tax_ids=None):
        super().__init__('biogrid')
        self.tax_ids = tax_ids
        self.load_bindings()
        self.dataset = Dataset(
            'biogrid', 'The BioGrid', 'http://thebiogrid.org/', None,
            'http://wiki.thebiogrid.org/doku.php/terms_and_conditions')

        # Defaults
        # our favorite animals
        # taxids = [9606,10090,10116,7227,7955,6239,8355]
        if self.tax_ids is None:
            self.tax_ids = [9606, 10090, 7955]

        if 'test_ids' not in config.get_config() or \
                'gene' not in config.get_config()['test_ids']:
            logger.warning("not configured with gene test ids.")
        else:
            self.test_ids = config.get_config()['test_ids']['gene']

        # data-source specific warnings
        # (will be removed when issues are cleared)
        logger.warning(
            "several MI experimental codes do not exactly map to ECO; "
            "using approximations.")
        return

    def fetch(self, is_dl_forced=False):
        """
        :param is_dl_forced:
        :return: None
        """
        self.get_files(is_dl_forced)

        # the version number is encoded in the filename in the zip.
        # for example, the interactions file may unzip to
        # BIOGRID-ALL-3.2.119.mitab.txt, where the version number is 3.2.119
        f = '/'.join((self.rawdir, self.files['interactions']['file']))
        st = os.stat(f)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
        with ZipFile(f, 'r') as myzip:
            flist = myzip.namelist()
            # assume that the first entry is the item
            fname = flist[0]
            # get the version from the filename
            version = \
                re.match(r'BIOGRID-ALL-(\d+\.\d+\.\d+)\.mitab.txt', fname)
        myzip.close()

        self.dataset.setVersion(filedate, str(version.groups()[0]))

        return

    def parse(self, limit=None):
        """
        :param limit:
        :return:
        """
        if self.testOnly:
            self.testMode = True

        self._get_interactions(limit)
        self._get_identifiers(limit)

        self.load_bindings()

        logger.info("Loaded %d test graph nodes", len(self.testgraph))
        logger.info("Loaded %d full graph nodes", len(self.graph))

        return

    def _get_interactions(self, limit):
#.........(part of the code omitted).........
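A worked example of the filename-to-version extraction in fetch() above, using the filename from the comment:

import re

fname = 'BIOGRID-ALL-3.2.119.mitab.txt'  # example from the comment above
version = re.match(r'BIOGRID-ALL-(\d+\.\d+\.\d+)\.mitab.txt', fname)
print(version.groups()[0])  # 3.2.119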
Example 7: HPOAnnotations
# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or alternatively: from dipper.models.Dataset.Dataset import setVersion [as alias]
#.........(part of the code omitted).........
        # data-source specific warnings to be removed when issues are cleared
        logger.warning(
            "note that some ECO classes are missing for ICE, PCS, and ITM;" +
            " using temporary mappings.")
        return

    def fetch(self, is_dl_forced=False):
        self.get_files(is_dl_forced)
        self.scrub()

        # get the latest build from jenkins
        # use the files['version'] file as the version
        fname = '/'.join((self.rawdir, self.files['version']['file']))
        with open(fname, 'r', encoding="utf8") as f:
            # 2015-04-23 13:01
            v = f.readline()  # read the first line (the only line, really)
            d = datetime.strptime(
                v.strip(), '%Y-%m-%d %H:%M').strftime("%Y-%m-%d-%H-%M")
            f.close()

        st = os.stat(fname)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")

        # this will cause two dates to be attached to the dataset
        # (one from the filedate, and the other from here)
        # TODO when #112 is implemented,
        # this will result in only the whole dataset being versioned
        self.dataset.setVersion(filedate, d)
        self.get_common_files()
        return

    def scrub(self):
        """
        Perform various data-scrubbing on the raw data files prior to parsing.
        For this resource, this currently includes:
        * revise errors in identifiers for some OMIM and PMIDs
        :return: None
        """
        # scrub file of the oddities...lots of publication rewriting
        f = '/'.join((self.rawdir, self.files['annot']['file']))
        logger.info('scrubbing PubMed:12345 --> PMID:12345')
        pysed.replace(r'PubMed:', 'PMID:', f)
        logger.info('scrubbing pmid:12345 --> PMID:12345')
        pysed.replace(r'pmid:', 'PMID:', f)
        logger.info('scrubbing PMID: 12345 --> PMID:12345')
        pysed.replace(r'PMID: *', 'PMID:', f)
        logger.info('scrubbing PMID12345 --> PMID:12345')
        pysed.replace(r'PMID([0-9][0-9]*)', r'PMID:\1', f)
        logger.info('scrubbing MIM12345 --> OMIM:12345')
        pysed.replace(r'MIM([0-9][0-9]*)', r'OMIM:\1', f)
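pysed.replace(pattern, replacement, path) rewrites the file in place. If pysed is unfamiliar, the same scrubbing rules can be expressed with the standard re module on an in-memory string; a small equivalent sketch (the sample line is made up):

import re

line = 'OMIM:119600\tpmid:12345; MIM98765'  # made-up sample input
line = re.sub(r'pmid:', 'PMID:', line)
line = re.sub(r'MIM([0-9][0-9]*)', r'OMIM:\1', line)
print(line)  # OMIM:119600    PMID:12345; OMIM:98765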
Example 8: BioGrid
# Required import: from dipper.models.Dataset import Dataset [as alias]
# Or alternatively: from dipper.models.Dataset.Dataset import setVersion [as alias]
class BioGrid(Source):
    """
    Biogrid interaction data
    """
    # TODO write up class summary for docstring

    files = {
        "interactions": {
            "file": "interactions.mitab.zip",
            "url": "http://thebiogrid.org/downloads/archives/Latest%20Release/BIOGRID-ALL-LATEST.mitab.zip",
        },
        "identifiers": {
            "file": "identifiers.tab.zip",
            "url": "http://thebiogrid.org/downloads/archives/Latest%20Release/BIOGRID-IDENTIFIERS-LATEST.tab.zip",
        },
    }

    # biogrid-specific identifiers for use in subsetting identifier mapping
    biogrid_ids = [
        106638,
        107308,
        107506,
        107674,
        107675,
        108277,
        108506,
        108767,
        108814,
        108899,
        110308,
        110364,
        110678,
        111642,
        112300,
        112365,
        112771,
        112898,
        199832,
        203220,
        247276,
        120150,
        120160,
        124085,
    ]

    def __init__(self, tax_ids=None):
        super().__init__("biogrid")
        self.tax_ids = tax_ids
        self.load_bindings()
        self.dataset = Dataset(
            "biogrid",
            "The BioGrid",
            "http://thebiogrid.org/",
            None,
            "http://wiki.thebiogrid.org/doku.php/terms_and_conditions",
        )
        # Defaults
        # taxids = [9606,10090,10116,7227,7955,6239,8355]  # our favorite animals
        if self.tax_ids is None:
            self.tax_ids = [9606, 10090, 7955]

        if "test_ids" not in config.get_config() or "gene" not in config.get_config()["test_ids"]:
            logger.warning("not configured with gene test ids.")
        else:
            self.test_ids = config.get_config()["test_ids"]["gene"]

        # data-source specific warnings (will be removed when issues are cleared)
        logger.warning("several MI experimental codes do not exactly map to ECO; using approximations.")
        return

    def fetch(self, is_dl_forced=False):
        """
        :param is_dl_forced:
        :return: None
        """
        self.get_files(is_dl_forced)

        # the version number is encoded in the filename in the zip.
        # for example, the interactions file may unzip to BIOGRID-ALL-3.2.119.mitab.txt,
        # where the version number is 3.2.119
        f = "/".join((self.rawdir, self.files["interactions"]["file"]))
        st = os.stat(f)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
        with ZipFile(f, "r") as myzip:
            flist = myzip.namelist()
            # assume that the first entry is the item
            fname = flist[0]
            # get the version from the filename
            version = re.match(r"BIOGRID-ALL-(\d+\.\d+\.\d+)\.mitab.txt", fname)
        myzip.close()

        self.dataset.setVersion(filedate, str(version.groups()[0]))

        return

#.........(part of the code omitted).........