本文整理汇总了 Python 中 dipper.utils.GraphUtils.GraphUtils 类的典型用法代码示例。如果您正苦于以下问题：Python GraphUtils 类的具体用法？Python GraphUtils 怎么用？Python GraphUtils 使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了 GraphUtils 类的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: parse
def parse(self, limit=None):
    """
    Run every file-processing step and load the shared properties.

    If no version number is set yet, it is first read from the name of
    the "letter.WS###" file in the raw directory. The in-memory lookup
    tables are reset before the ``process_*`` steps run.

    :param limit: handed unchanged to each ``process_*`` step; when not
        None a message announces that only the first rows are parsed
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %s rows of each file", limit)

    if self.version_num is None:
        import os
        logger.info("Figuring out version num for files")
        # The "letter.WS###" file is the only one that keeps the
        # version number in its name, so probe the raw directory for it.
        letter_file = next(
            name for name in os.listdir(self.rawdir)
            if re.match(r'letter', name))
        self.update_wsnum_in_files(
            re.search(r'(WS\d+)', letter_file).group(1))

    logger.info("Parsing files...")

    if self.testOnly:
        self.testMode = True
    target_graph = self.testgraph if self.testMode else self.graph

    self.nobnodes = True  # FIXME

    # Reset the lookup tables that the processing steps fill in.
    self.id_label_map = {}                    # any label for a given id
    self.genotype_backgrounds = {}            # genotype -> background
    self.extrinsic_id_to_enviro_id_hash = {}
    self.variant_loci_genes = {}              # genes variant due to a seq alt
    self.environment_hash = {}                # parts of an environment
    self.wildtype_genotypes = []
    self.rnai_gene_map = {}                   # rnai_reagent -> gene targets

    # Disabled steps:
    #   process_gene_desc        (TEC: input file is missing, 2016-Mar-03)
    #   process_gene_interaction (TODO add when complete)
    steps = (
        self.process_gene_ids,
        self.process_allele_phenotype,
        self.process_rnai_phenotypes,
        self.process_pub_xrefs,
        self.process_feature_loc,
        self.process_disease_association,
    )
    for step in steps:
        step(limit)

    logger.info("Finished parsing.")

    self.load_bindings()
    graph_utils = GraphUtils(curie_map.get())
    graph_utils.loadAllProperties(target_graph)
    graph_utils.loadObjectProperties(target_graph, Genotype.object_properties)

    logger.info("Found %d nodes in graph", len(self.graph))
    logger.info("Found %d nodes in testgraph", len(self.testgraph))
    return
示例2: _parse_curated_chem_disease
def _parse_curated_chem_disease(self, limit):
    """
    Read the tab-separated 'publications' static file and add one
    chemical-to-disease association per data row.

    For every non-comment row the chemical (prefixed with ``MESH:``) and
    the disease are added as classes, the publication (if any) is added
    as a journal-article reference, and ``_make_association`` is called.

    :param limit: stop after this many data rows unless in test mode;
        None means no limit
    :return: None
    """
    line_counter = 0
    file_path = '/'.join(
        (self.rawdir, self.static_files['publications']['file']))
    gu = GraphUtils(curie_map.get())
    with open(file_path, 'r') as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        for row in reader:
            # catch comment lines
            if re.match('^#', ' '.join(row)):
                continue
            line_counter += 1
            self._check_list_len(row, 10)
            (pub_id, disease_label, disease_id, disease_cat, evidence,
             chem_label, chem_id, cas_rn, gene_symbol, gene_acc) = row

            rel_id = self._get_relationship_id(evidence)
            # Prefix exactly once; the same curie is used for the class
            # and for the association subject.
            chem_id = 'MESH:' + chem_id
            gu.addClassToGraph(self.g, chem_id, chem_label)
            gu.addClassToGraph(self.g, disease_id, None)

            pub_ids = []
            if pub_id != '':
                pub_id = 'PMID:' + pub_id
                r = Reference(
                    pub_id, Reference.ref_types['journal_article'])
                r.addRefToGraph(self.g)
                pub_ids.append(pub_id)
            # NOTE(review): rows without a publication now pass an empty
            # list instead of crashing on 'PMID:' + None -- confirm that
            # _make_association accepts an empty list.
            self._make_association(chem_id, disease_id, rel_id, pub_ids)

            if not self.testMode and limit is not None \
                    and line_counter >= limit:
                break
    return
示例3: _get_phenotypicseries_parents
def _get_phenotypicseries_parents(entry, g):
    """
    Extract the phenotypic series parent relationship out of the entry.

    Series numbers are collected from ``entry['phenotypeMapList']`` and
    from ``entry['geneMap']['phenotypeMapList']`` when the entry is
    flagged with ``phenotypicSeriesExists``. Each series is then added
    as a class and linked to the entry through ``addSubclass``.

    :param entry: mapping for one entry; must contain 'mimNumber'
    :param g: graph the classes and subclass links are added to
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    omimid = 'OMIM:' + str(entry['mimNumber'])

    # the phenotypic series mappings
    # NOTE(review): the source lost its indentation; both list lookups
    # are assumed to sit under the phenotypicSeriesExists check -- confirm.
    serieslist = []
    if 'phenotypicSeriesExists' in entry \
            and entry['phenotypicSeriesExists'] is True:
        phenolists = []
        if 'phenotypeMapList' in entry:
            phenolists.append(entry['phenotypeMapList'])
        if 'geneMap' in entry and 'phenotypeMapList' in entry['geneMap']:
            phenolists.append(entry['geneMap']['phenotypeMapList'])
        for phenolist in phenolists:
            for p in phenolist:
                # Not every map carries a series number. Guard both
                # lists the same way so a missing key no longer raises
                # KeyError for the entry-level list.
                if 'phenotypicSeriesNumber' in p['phenotypeMap']:
                    serieslist.append(
                        p['phenotypeMap']['phenotypicSeriesNumber'])

    # add this entry as a subclass of the series entry
    for ser in serieslist:
        series_id = 'OMIM:' + ser
        gu.addClassToGraph(g, series_id, None)
        gu.addSubclass(g, series_id, omimid)
    return
示例4: _process_phenotypicseries
def _process_phenotypicseries(self, limit):
    """
    Creates classes from the OMIM phenotypic series list. These are
    grouping classes to hook the more granular OMIM diseases.

    Everything up to and including the line that starts with
    'Phenotypic Series' is treated as header. Each remaining line is
    split on a tab into (label, number) and added as a class.

    :param limit: accepted for interface parity; this method does not
        currently consult it, so every line of the file is read
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    logger.info("getting phenotypic series titles")
    gu = GraphUtils(curie_map.get())
    line_counter = 0
    start = False
    # Raw string: the former '\w*$' relied on an invalid escape sequence.
    # The pattern matches blank lines and lines made up only of word
    # characters, which have no tab to split on.
    skippable = re.compile(r'\w*$')
    with open('/'.join(
            (self.rawdir, self.files['phenotypicSeries']['file']))) as f:
        for line in f:
            # there are several lines of header in the file,
            # so skip until the title line has been seen
            if not start:
                if re.match('Phenotypic Series', line):
                    start = True
                continue
            if skippable.match(line):
                # skip blank lines
                continue
            line = line.strip()
            line_counter += 1
            (ps_label, ps_num) = line.split('\t')
            omim_id = 'OMIM:' + ps_num
            gu.addClassToGraph(g, omim_id, ps_label)
    return
示例5: _process_collection
def _process_collection(self, collection_id, label, page):
    """
    Add the repository (collection) individual to both graphs.

    Triples:
        Repository a ERO:collection
        rdf:label Literal(label)
        foaf:page Literal(page)

    :param collection_id: local id, prefixed with 'CoriellCollection:'
    :param label: label given to the repository individual
    :param page: page passed to ``addPage`` for the repository
    :return: None
    """
    # ############# BUILD THE CELL LINE REPOSITORY #############
    for target in (self.graph, self.testgraph):
        # FIXME: How to devise a label for each repository?
        graph_utils = GraphUtils(curie_map.get())
        repository = 'CoriellCollection:' + collection_id
        graph_utils.addIndividualToGraph(
            target, repository, label, self.terms['collection'])
        graph_utils.addPage(target, repository, page)
    return
示例6: _map_eom_terms
def _map_eom_terms(self, raw, limit=None):
    """
    Load the HP ID mappings from the local tsv file.

    Triples:
        <eom id> owl:equivalentClass <hp id>

    :param raw: path of the tab-separated mapping file (first line is
        a header and is skipped)
    :param limit: stop once more than this many rows were read;
        None means read everything
    :return: None
    """
    graph_utils = GraphUtils(curie_map.get())
    with open(raw, 'r') as f1:
        f1.readline()  # header row; discard
        for row_number, line in enumerate(f1, start=1):
            (term_id, term_label, hp_id, _hp_label, _notes) = \
                line.split('\t')

            # The file writes HP ids with underscores; use colons.
            hp_id = hp_id.replace('_', ':')

            if 'HP:' in hp_id:
                # the HP term becomes a class, then is declared
                # equivalent to the morphology term
                graph_utils.addClassToGraph(self.graph, hp_id, None)
                graph_utils.addEquivalentClass(self.graph, term_id, hp_id)
            else:
                logger.warning('No matching HP term for %s', term_label)

            if limit is not None and row_number > limit:
                break
    return
示例7: process_gene_desc
def process_gene_desc(self, limit):
    """
    Read the gzipped gene description file and attach the texts to genes.

    The concise description becomes the definition of the gene (unless
    it is 'none available'). Each other description is added as a
    description tagged with its kind, unless it is empty, starts with
    'none', or merely repeats the concise text.

    :param limit: outside test mode, stop once more than this many
        lines were counted; None means no limit
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_desc']['file']))
    target = self.testgraph if self.testMode else self.graph
    graph_utils = GraphUtils(curie_map.get())

    logger.info("Processing Gene descriptions")
    line_counter = 0
    # geno = Genotype(g)  # TODO unused
    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
            quotechar='\"')
        for row in filereader:
            # comment rows are not counted
            if re.match(r'\#', ''.join(row)):
                continue
            line_counter += 1
            if line_counter == 1:
                # first counted row is skipped
                continue

            (gene_num, public_name, molecular_name, concise_description,
             provisional_description, detailed_description,
             automated_description, gene_class_description) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            gene_id = 'WormBase:' + gene_num

            if concise_description != 'none available':
                graph_utils.addDefinition(
                    target, gene_id, concise_description)

            # skip a description if it's identical to the concise one
            tagged_texts = (
                ('provisional', provisional_description),
                ('automated', automated_description),
                ('detailed', detailed_description),
                ('gene class', gene_class_description),
            )
            for tag, text in tagged_texts:
                unusable = (
                    text == concise_description
                    or re.match(r'none', text)
                    or text == '')
                if unusable:
                    continue  # don't use it
                graph_utils.addDescription(
                    target, gene_id, ' '.join((text, '[' + tag + ']')))

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
    return
示例8: _get_gene_history
def _get_gene_history(self, limit):
    """
    Walk the gzipped gene_history file and record discontinued gene ids.

    Each old gene id is added as a deprecated class replaced by the new
    gene id, and the old gene symbol becomes a synonym of the new gene.

    :param limit: outside test mode, stop once more than this many rows
        were accepted; None means no limit
    :return: None
    """
    graph_utils = GraphUtils(curie_map.get())
    target = self.testgraph if self.testMode else self.graph

    logger.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['gene_history']['file']))
    logger.info("FILE: %s", myfile)
    with gzip.open(myfile, 'rb') as f:
        for raw_line in f:
            line = raw_line.decode().strip()
            # skip comments
            if line.startswith('#'):
                continue

            (tax_num, gene_num, discontinued_num, discontinued_symbol,
             discontinued_date) = line.split('\t')

            # (a configurable taxid/geneid filter used to live here;
            #  it is disabled)

            # rows lacking either id cannot express a replacement
            if '-' in (gene_num, discontinued_num):
                continue
            if self.testMode and int(gene_num) not in self.gene_ids:
                continue
            if int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1
            new_gene = 'NCBIGene:' + gene_num
            old_gene = 'NCBIGene:' + discontinued_num

            # add the two genes
            graph_utils.addClassToGraph(target, new_gene, None)
            graph_utils.addClassToGraph(
                target, old_gene, discontinued_symbol)

            # the new gene id replaces the old gene id
            graph_utils.addDeprecatedClass(target, old_gene, [new_gene])

            # the old symbol is a synonym of the new gene
            graph_utils.addSynonym(target, new_gene, discontinued_symbol)

            if (not self.testMode) \
                    and (limit is not None and line_counter > limit):
                break
    return
示例9: process_pub_xrefs
def process_pub_xrefs(self, limit=None):
    """
    Link WormBase paper ids to their PMID or DOI cross references.

    Example rows of the tab-separated input::

        WBPaper00000009 pmid8805<BR>
        WBPaper00000011 doi10.1139/z78-244<BR>
        WBPaper00000012 cgc12<BR>

    A 'pmid' xref becomes a journal-article Reference, a 'doi' xref a
    plain Reference; each is declared the same individual as the
    WormBase paper. 'cgc' and all other xrefs are skipped, as are
    non-pmid xrefs containing brackets, parentheses or whitespace.

    :param limit: outside test mode, stop once more than this many rows
        were read; None means no limit
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['pub_xrefs']['file']))
    g = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())

    logger.info("Processing publication xrefs")
    line_counter = 0
    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (wb_ref, xref) = row

            if self.testMode and wb_ref not in self.test_ids['pub']:
                continue

            ref_id = 'WormBase:' + wb_ref
            xref = re.sub(r'<BR>', '', xref).strip()

            if re.match(r'pmid', xref):
                # Strip only the leading marker; the unanchored
                # substitution removed every later occurrence as well.
                xref_id = 'PMID:' + re.sub(r'^pmid\s*', '', xref)
                r = Reference(
                    xref_id, Reference.ref_types['journal_article'])
            elif re.search(r'[\(\)\<\>\[\]\s]', xref):
                continue
            elif re.match(r'doi', xref):
                # A DOI may itself contain "doi", so anchor to the prefix.
                xref_id = 'DOI:' + re.sub(r'^doi', '', xref)
                r = Reference(xref_id)
            else:
                # TODO not sure what to do with cgc xrefs;
                # any other kind of xref is ignored as well
                continue

            r.addRefToGraph(g)
            gu.addSameIndividual(g, ref_id, xref_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
    return
示例10: _process_pathways
def _process_pathways(self, limit=None):
    """
    Add the KEGG pathway IDs together with their graphical depiction.

    These are the canonical pathways as defined in KEGG; the depiction
    maps 1:1 with the identifier.

    Triples created:
        <pathway_id> is a GO:signal_transduction
        <pathway_id> rdfs:label <pathway_name>
        <gene_id> RO:involved_in <pathway_id>

    :param limit: outside test mode, stop once more than this many rows
        were read; None means no limit
    :return: None
    """
    logger.info("Processing pathways")
    target = self.testgraph if self.testMode else self.graph
    pathways = Pathway(target, self.nobnodes)
    graph_utils = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['pathway']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for line_counter, row in enumerate(filereader, start=1):
            (pathway_id, pathway_name) = row

            if self.testMode and \
                    pathway_id not in self.test_ids['pathway']:
                continue

            pathway_id = 'KEGG-' + pathway_id.strip()
            pathways.addPathway(pathway_id, pathway_name)

            # the pathway images from kegg map 1:1 onto the ids,
            # so the depiction url is derived from the id itself
            image_url = ''.join((
                'http://www.genome.jp/kegg/pathway/map/',
                re.sub(r'KEGG-path:', '', pathway_id),
                '.png'))
            graph_utils.addDepiction(target, pathway_id, image_url)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    logger.info("Done with pathways")
    return
示例11: write
def write(self, format='rdfxml', stream=None):
    """
    Write out all of the graphs associated with the source.

    Right now these are hardcoded to be the dataset description graph
    plus either the test graph (in test mode) or the regular graph.
    Unless ``stream='stdout'`` is supplied, they are written to files
    under ``self.outdir`` named after ``self.name``. If ``self.name`` is
    None, output falls back to stdout.

    :param format: serialization name; 'rdfxml' and 'turtle' map to the
        extensions 'xml' and 'ttl', anything else is used as-is
    :param stream: None to write files, or 'stdout' (case-insensitive,
        surrounding whitespace ignored)
    :return: None
    """
    format_to_xtn = {
        'rdfxml': 'xml', 'turtle': 'ttl'
    }

    # Both names must exist even when no file name can be derived;
    # datasetfile used to be unbound (NameError) when self.name was None.
    graph_file = None
    datasetfile = None
    if self.name is not None:
        extension = format_to_xtn.get(format, format)
        # the regular graph output file
        graph_file = '.'.join(
            ('/'.join((self.outdir, self.name)), extension))
        # the dataset file
        datasetfile = '.'.join(
            ('/'.join((self.outdir, self.name + '_dataset')), extension))
    else:
        logger.warning("No output file set. Using stdout")
        stream = 'stdout'

    # start off with only the dataset descriptions
    graphs = [
        {'g': self.dataset.getGraph(), 'file': datasetfile},
    ]
    # add the test graph in test mode, the regular graph otherwise
    if self.testMode:
        graphs += [{'g': self.testgraph, 'file': self.testfile}]
    else:
        graphs += [{'g': self.graph, 'file': graph_file}]

    gu = GraphUtils(None)
    # loop through each of the graphs and print them out
    for g in graphs:
        if stream is None:
            gu.write(g['g'], format, file=g['file'])
        # str has no .lowercase(); that call raised AttributeError for
        # every non-None stream.
        elif stream.lower().strip() == 'stdout':
            gu.write(g['g'], format)
        else:
            logger.error("I don't understand your stream.")
    return
示例12: _process_genes_kegg2ncbi
def _process_genes_kegg2ncbi(self, limit=None):
    """
    Map the KEGG human gene IDs to the corresponding NCBI Gene IDs.

    Triples created:
        <kegg_gene_id> is a class
        <ncbi_gene_id> is a class
        <kegg_gene_id> equivalentClass <ncbi_gene_id>

    :param limit: outside test mode, stop once more than this many rows
        were read; None means no limit
    :return: None
    """
    logger.info("Processing KEGG gene IDs to NCBI gene IDs")
    target = self.testgraph if self.testMode else self.graph
    graph_utils = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['ncbi']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for line_counter, row in enumerate(filereader, start=1):
            (kegg_gene_id, ncbi_gene_id, link_type) = row

            if self.testMode and \
                    kegg_gene_id not in self.test_ids['genes']:
                continue

            # Swap the NCBI gene ID prefix and prefix the KEGG id.
            ncbi_curie = re.sub(r'ncbi-geneid', 'NCBIGene', ncbi_gene_id)
            kegg_curie = 'KEGG-' + kegg_gene_id

            # Adding the KEGG gene ID here is redundant unless this
            # table holds gene IDs that are absent from the genes table.
            graph_utils.addClassToGraph(target, kegg_curie, None)
            graph_utils.addClassToGraph(target, ncbi_curie, None)
            graph_utils.addEquivalentClass(target, kegg_curie, ncbi_curie)

            if (not self.testMode) and (
                    limit is not None and line_counter > limit):
                break

    logger.info("Done with KEGG gene IDs to NCBI gene IDs")
    return
示例13: make_association_id
def make_association_id(definedby, sub, pred, obj, attributes=None):
    """
    Build an identifier for an OBAN-style association from its parts.

    The parts (plus any extra attributes) are joined with '+', with
    every None item left out, and the joined string is passed to
    ``GraphUtils.digest_id``. The result is prefixed with 'MONARCH:'.
    Subclasses of Assoc can submit an additional array of attributes
    that will be appended before digesting.

    Note this is equivalent to a RDF blank node.

    :param definedby: The (data) resource that provided the annotation
    :param sub: subject of the association
    :param pred: predicate of the association
    :param obj: object of the association
    :param attributes: optional sequence of further items to digest
    :return: the association identifier, 'MONARCH:' plus the digest
    """
    parts = [definedby, sub, pred, obj]
    if attributes is not None and len(attributes) > 0:
        parts.extend(attributes)
    digest_input = '+'.join([part for part in parts if part is not None])
    return 'MONARCH:' + GraphUtils.digest_id(digest_input)
示例14: __init__
def __init__(self):
    """
    Set up the 'ctd' source: dataset description, configured test ids,
    and the graph helpers.

    Gene and disease test ids are read from the 'test_ids' section of
    the configuration; when either is missing a warning is logged and
    an empty list is used instead.
    """
    Source.__init__(self, 'ctd')
    self.dataset = Dataset(
        'ctd', 'CTD', 'http://ctdbase.org', None,
        'http://ctdbase.org/about/legal.jsp')

    if 'test_ids' in config.get_config() \
            and 'gene' in config.get_config()['test_ids']:
        self.test_geneids = config.get_config()['test_ids']['gene']
    else:
        logger.warning("not configured with gene test ids.")
        self.test_geneids = []

    if 'test_ids' in config.get_config() \
            and 'disease' in config.get_config()['test_ids']:
        self.test_diseaseids = config.get_config()['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_diseaseids = []

    self.gu = GraphUtils(curie_map.get())
    self.g = self.graph
    self.geno = Genotype(self.g)
    return
示例15: __init__
def __init__(self):
    """
    Set up the 'mpd' source: namespaces, dataset description, empty
    lookup tables, and the graph helpers.
    """
    Source.__init__(self, 'mpd')
    # @N, not sure if this step is required
    self.namespaces.update(curie_map.get())

    # Describe this resource.
    # @N: Note that there is no license as far as I can tell
    self.dataset = Dataset(
        'mpd', 'MPD', 'http://phenome.jax.org', None, None)
    # TODO add a citation for mpd dataset as a whole
    self.dataset.set_citation('PMID:15619963')

    # Plain settings and empty lookup tables.
    self.nobnodes = True  # FIXME
    self.stdevthreshold = 2
    self.idlabel_hash = {}
    self.assayhash = {}
    # mean value for each measure by strain+sex
    self.strain_scores_by_measure = {}
    # mean/zscore of each measure by strain+sex
    self.score_means_by_measure = {}

    self.geno = Genotype(self.graph)
    self.gu = GraphUtils(curie_map.get())
    return