本文整理汇总了Python中dipper.utils.GraphUtils.GraphUtils.addEquivalentClass方法的典型用法代码示例。如果您正苦于以下问题:Python GraphUtils.addEquivalentClass方法的具体用法?Python GraphUtils.addEquivalentClass怎么用?Python GraphUtils.addEquivalentClass使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类dipper.utils.GraphUtils.GraphUtils的用法示例。
在下文中一共展示了GraphUtils.addEquivalentClass方法的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _map_eom_terms
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addEquivalentClass [as 别名]
def _map_eom_terms(self, raw, limit=None):
    """
    Read the local EOM-to-HP mapping TSV and assert class equivalences.

    The first row of the file is treated as a header and skipped. Every
    following row is split on tabs into exactly five columns. Underscores
    in the HP column are replaced by colons; when the result contains
    ``HP:`` the HP term is added as a class and this triple is created:

        <eom id> owl:equivalentClass <hp id>

    Rows without an HP identifier are reported with a warning.

    :param raw: path of the tab-delimited mapping file
    :param limit: optional cap; reading stops once the number of data rows
        read so far exceeds it
    :return: None
    """
    graph_utils = GraphUtils(curie_map.get())
    rows_read = 0
    with open(raw, 'r') as mapping_file:
        # The first row is the header; discard it.
        mapping_file.readline()
        for record in mapping_file:
            rows_read += 1
            (morphology_term_id, morphology_term_label,
             hp_id, hp_label, notes) = record.split('\t')

            # The file writes HP ids with an underscore (HP_nnn); make
            # them CURIE-shaped.
            hp_id = hp_id.replace('_', ':')
            if not re.match(".*HP:.*", hp_id):
                logger.warning('No matching HP term for %s', morphology_term_label)
            else:
                # Declare the HP term, then tie the EOM term to it.
                graph_utils.addClassToGraph(self.graph, hp_id, None)
                graph_utils.addEquivalentClass(
                    self.graph, morphology_term_id, hp_id)

            if limit is not None and rows_read > limit:
                break
    return
示例2: _get_process_allelic_variants
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addEquivalentClass [as 别名]
def _get_process_allelic_variants(self, entry, g):
    """
    Add the allelic variants listed in one OMIM entry to the graph.

    For every variant whose status is ``live`` this adds the allele, links
    it to the entry's gene, attaches the publications cited in the variant
    text, declares dbSNP ids as equivalent, adds ClinVar accessions as
    xrefs and adds a page link to the OMIM web entry. Variants whose status
    contains ``moved`` (this covers both 'moved' and 'removed') are added
    as deprecated individuals, pointing at the ``movedTo`` id when one is
    given. Any other status is logged as an error.

    :param entry: parsed OMIM entry (mapping) or None; None is a no-op
    :param g: graph the triples are added to
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    geno = Genotype(g)
    # NOTE(review): `du` is never used below; the constructor call is kept
    # in case it has side effects -- confirm and remove.
    du = DipperUtil()

    if entry is None:
        return

    entry_num = entry['mimNumber']
    # process the ref list just to get the pmids
    ref_to_pmid = self._get_pubs(entry, g)

    if 'allelicVariantList' not in entry:
        return

    for al in entry['allelicVariantList']:
        variant = al['allelicVariant']
        al_num = variant['number']
        al_id = 'OMIM:'+str(entry_num)+'.'+str(al_num).zfill(4)
        al_label = None
        al_description = None
        status = variant['status']

        if status == 'live':
            # reference numbers cited in the variant text, e.g. "{12:"
            cited_refs = set()
            if 'mutations' in variant:
                al_label = variant['mutations']
            if 'text' in variant:
                al_description = variant['text']
                cited_refs = set(re.findall(r'\{(\d+)\:', al_description))

            geno.addAllele(
                al_id, al_label, geno.genoparts['variant_locus'],
                al_description)
            geno.addAlleleOfGene(
                al_id, 'OMIM:'+str(entry_num),
                geno.object_properties['is_sequence_variant_instance_of'])

            # look up the pubmed id in the list of references
            # NOTE(review): a cited number that is absent from ref_to_pmid
            # raises here, exactly as before -- confirm that is intended.
            for r in cited_refs:
                pmid = ref_to_pmid[int(r)]
                gu.addTriple(
                    g, pmid, gu.object_properties['is_about'], al_id)

            if 'dbSnps' in variant:
                for dnum in re.split(',', variant['dbSnps']):
                    did = 'dbSNP:'+dnum.strip()
                    gu.addIndividualToGraph(g, did, None)
                    gu.addEquivalentClass(g, al_id, did)

            if 'clinvarAccessions' in variant:
                # clinvarAccessions is triple-semicolon delimited,
                # each item looks like RCV000020059;;1
                for item in re.split(';;;', variant['clinvarAccessions']):
                    rcv_match = re.match(r'(RCV\d+)\;\;', item)
                    if rcv_match is None:
                        # A malformed accession used to abort the whole
                        # entry with AttributeError; skip just that item.
                        logger.warning(
                            'Unparseable ClinVar accession %s for %s',
                            item, al_id)
                        continue
                    gu.addXref(g, al_id, 'ClinVar:'+rcv_match.group(1))

            gu.addPage(
                g, al_id,
                "http://omim.org/entry/"+str(entry_num)+"#"+str(al_num).zfill(4))
        elif re.search('moved', status):
            # for both 'moved' and 'removed'
            moved_ids = None
            if 'movedTo' in variant:
                moved_ids = ['OMIM:'+variant['movedTo']]
            gu.addDeprecatedIndividual(g, al_id, moved_ids)
        else:
            logger.error('Uncaught alleleic variant status %s', status)
    # end loop allelicVariantList
    return
示例3: _process_genes
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addEquivalentClass [as 别名]
def _process_genes(self, taxid, limit=None):
    """
    Load the Ensembl gene table for one taxon into the graph.

    Each tab-delimited row provides the Ensembl gene id, external gene
    name, description, biotype and Entrez gene id; rows for human
    (taxid '9606') carry the HGNC id as an additional sixth column. Every
    gene is added as a class, declared equivalent to its NCBIGene (and,
    when present, HGNC) id and linked to its taxon.

    A row with fewer columns than required is logged as a data error and
    processing stops immediately (the property-loading calls at the end
    are then skipped, as before).

    :param taxid: NCBI taxon number as a string; also the key into
        ``self.files``
    :param limit: optional cap on processed rows; ignored in test mode
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    g = self.testgraph if self.testMode else self.graph
    geno = Genotype(g)
    raw = '/'.join((self.rawdir, self.files[taxid]['file']))
    line_counter = 0

    # Human rows have one extra trailing column holding the HGNC id.
    is_human = taxid == '9606'
    required_columns = 6 if is_human else 5

    logger.info("Processing Ensembl genes for tax %s", taxid)
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        for row in filereader:
            # The guard must cover every column read below; the previous
            # "< 4" check let short rows through to the unpack and to
            # row[5], which then raised instead of being reported.
            if len(row) < required_columns:
                logger.error("Data error for file %s", raw)
                return

            (ensembl_gene_id, external_gene_name, description,
             gene_biotype, entrezgene) = row[0:5]

            # in the case of human genes, we also get the hgnc id,
            # and is the last col
            hgnc_id = row[5] if is_human else None

            if self.testMode and entrezgene != '' \
                    and int(entrezgene) not in self.gene_ids:
                continue

            line_counter += 1
            gene_id = 'ENSEMBL:'+ensembl_gene_id
            if description == '':
                description = None

            # NOTE(review): the mapped gene type is discarded right away,
            # so genes are added without a type. Kept exactly as found
            # because it is unclear whether this is deliberate -- confirm.
            gene_type_id = self._get_gene_type(gene_biotype)
            gene_type_id = None

            gu.addClassToGraph(
                g, gene_id, external_gene_name, gene_type_id, description)

            if entrezgene != '':
                gu.addEquivalentClass(g, gene_id, 'NCBIGene:'+entrezgene)
            if hgnc_id is not None and hgnc_id != '':
                gu.addEquivalentClass(g, gene_id, hgnc_id)
            geno.addTaxon('NCBITaxon:'+taxid, gene_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
    gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
    gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)
    return
示例4: _process_genes_kegg2ncbi
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addEquivalentClass [as 别名]
def _process_genes_kegg2ncbi(self, limit=None):
    """
    Declare KEGG gene ids equivalent to their NCBI Gene ids.

    Reads the three-column (KEGG gene id, NCBI gene id, link type)
    tab-delimited file registered under ``self.files['ncbi']``.

    Triples created:
        <kegg_gene_id> is a class
        <ncbi_gene_id> is a class
        <kegg_gene_id> equivalentClass <ncbi_gene_id>

    :param limit: optional cap on the number of rows read; ignored in
        test mode, where rows are filtered by ``self.test_ids['genes']``
    :return: None
    """
    logger.info("Processing KEGG gene IDs to NCBI gene IDs")
    g = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['ncbi']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        # Rows are counted from 1, before any filtering.
        for line_counter, row in enumerate(filereader, start=1):
            (kegg_gene_id, ncbi_gene_id, link_type) = row

            # The test-set filter applies to the unprefixed KEGG id.
            if self.testMode and \
                    kegg_gene_id not in self.test_ids['genes']:
                continue

            # Swap the source's prefix for the NCBIGene CURIE prefix and
            # prefix the KEGG id.
            ncbi_gene_id = ncbi_gene_id.replace('ncbi-geneid', 'NCBIGene')
            kegg_gene_id = 'KEGG-'+kegg_gene_id

            # Declaring the KEGG gene here is redundant unless this table
            # lists genes that are missing from the genes table.
            gu.addClassToGraph(g, kegg_gene_id, None)
            gu.addClassToGraph(g, ncbi_gene_id, None)
            gu.addEquivalentClass(g, kegg_gene_id, ncbi_gene_id)

            if (not self.testMode) and (
                    limit is not None and line_counter > limit):
                break

    logger.info("Done with KEGG gene IDs to NCBI gene IDs")
    return
示例5: _process_pathway_pathway
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addEquivalentClass [as 别名]
def _process_pathway_pathway(self, limit):
    """
    Link alternative KEGG identifiers of the same pathway to each other.

    Pathways may be known under both a "map" and a "ko" identifier. Each
    row of the two-column file registered under
    ``self.files['pathway_pathway']`` pairs two pathway ids; both receive
    the ``KEGG-`` prefix and, when they differ, an equivalentClass axiom
    is added between them.

    :param limit: optional cap on the number of rows read; ignored in
        test mode, where rows are filtered by ``self.test_ids['pathway']``
    :return: None
    """
    logger.info("Processing KEGG pathways to other ids")
    g = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['pathway_pathway']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        # Rows are counted from 1, before any filtering.
        for line_counter, row in enumerate(filereader, start=1):
            (first_id, second_id) = row

            # The test-set filter applies to the unprefixed first id.
            if self.testMode and \
                    first_id not in self.test_ids['pathway']:
                continue

            # will look like KEGG-path:map04130 or KEGG-path:ko04130
            first_id = 'KEGG-'+first_id
            second_id = 'KEGG-'+second_id

            # A pathway paired with itself needs no axiom.
            if first_id != second_id:
                gu.addEquivalentClass(g, first_id, second_id)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break
    return
示例6: _get_mappedids
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addEquivalentClass [as 别名]
def _get_mappedids(self, entry, g):
    """
    Add the external identifier mappings of one OMIM entry to the graph.

    From the entry's ``externalLinks``:
      * Orphanet diseases are added as labelled classes and xrefs,
      * UMLS ids are added as classes and xrefs,
      * when the entry is typed as a gene, its NCBI gene ids are declared
        equivalent classes.

    :param entry: parsed OMIM entry (mapping with at least 'mimNumber')
    :param g: graph the triples are added to
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    omimid = 'OMIM:'+str(entry['mimNumber'])
    orpha_mappings = []

    if 'externalLinks' not in entry:
        return
    links = entry['externalLinks']

    if 'orphanetDiseases' in links:
        # Items are separated by ';;;'; within an item the Orphanet number,
        # an internal number and the label are separated by ';;', e.g.
        # 2970;;566;;Prune belly syndrome
        for item in links['orphanetDiseases'].split(';;;'):
            (orpha_num, internal_num, orpha_label) = item.split(';;')
            orpha_id = 'Orphanet:'+orpha_num.strip()
            orpha_mappings.append(orpha_id)
            gu.addClassToGraph(g, orpha_id, orpha_label.strip())
            gu.addXref(g, omimid, orpha_id)

    if 'umlsIDs' in links:
        # comma-separated list of UMLS identifiers
        for umls_num in links['umlsIDs'].split(','):
            umls_id = 'UMLS:'+umls_num
            gu.addClassToGraph(g, umls_id, None)
            gu.addXref(g, omimid, umls_id)

    # Gene equivalences are only asserted for entries typed as genes.
    if self._get_omimtype(entry) == Genotype.genoparts['gene']:
        if 'geneIDs' in links:
            for gene_num in links['geneIDs'].split(','):
                gu.addEquivalentClass(g, omimid, 'NCBIGene:'+str(gene_num))
    return
示例7: _process_diseasegene
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addEquivalentClass [as 别名]
#.........这里部分代码省略.........
# get the element name and id
# id = elem.get('id') # some internal identifier
disorder_num = elem.find("OrphaNumber").text
disorder_id = "Orphanet:" + str(disorder_num)
if self.testMode and disorder_id not in config.get_config()["test_ids"]["disease"]:
continue
disorder_label = elem.find("Name").text
# make a hash of internal gene id to type for later lookup
gene_iid_to_type = {}
gene_list = elem.find("GeneList")
for gene in gene_list.findall("Gene"):
gene_iid = gene.get("id")
gene_type = gene.find("GeneType").get("id")
gene_iid_to_type[gene_iid] = gene_type
gu.addClassToGraph(g, disorder_id, disorder_label) # assuming that these are in the ontology
assoc_list = elem.find("DisorderGeneAssociationList")
for a in assoc_list.findall("DisorderGeneAssociation"):
gene_iid = a.find(".//Gene").get("id")
gene_name = a.find(".//Gene/Name").text
gene_symbol = a.find(".//Gene/Symbol").text
gene_num = a.find("./Gene/OrphaNumber").text
gene_id = "Orphanet:" + str(gene_num)
gene_type_id = self._map_gene_type_id(gene_iid_to_type[gene_iid])
gu.addClassToGraph(g, gene_id, gene_symbol, gene_type_id, gene_name)
syn_list = a.find("./Gene/SynonymList")
if int(syn_list.get("count")) > 0:
for s in syn_list.findall("./Synonym"):
gu.addSynonym(g, gene_id, s.text)
dgtype = a.find("DisorderGeneAssociationType").get("id")
rel_id = self._map_rel_id(dgtype)
dg_label = a.find("./DisorderGeneAssociationType/Name").text
if rel_id is None:
logger.warn(
"Cannot map association type (%s) to RO for association (%s | %s). Skipping.",
dg_label,
disorder_label,
gene_symbol,
)
continue
alt_locus_id = "_" + gene_num + "-" + disorder_num + "VL"
alt_label = " ".join(
("some variant of", gene_symbol.strip(), "that is a", dg_label.lower(), disorder_label)
)
if self.nobnodes:
alt_locus_id = ":" + alt_locus_id
gu.addIndividualToGraph(g, alt_locus_id, alt_label, geno.genoparts["variant_locus"])
geno.addAlleleOfGene(alt_locus_id, gene_id)
# consider typing the gain/loss-of-function variants like:
# http://sequenceontology.org/browser/current_svn/term/SO:0002054
# http://sequenceontology.org/browser/current_svn/term/SO:0002053
# use "assessed" status to issue an evidence code
# FIXME I think that these codes are sub-optimal
status_code = a.find("DisorderGeneAssociationStatus").get("id")
eco_id = "ECO:0000323" # imported automatically asserted information used in automatic assertion
if status_code == "17991": # Assessed # TODO are these internal ids stable between releases?
eco_id = "ECO:0000322" # imported manually asserted information used in automatic assertion
# Non-traceable author statement ECO_0000034
# imported information in automatic assertion ECO_0000313
assoc = G2PAssoc(self.name, alt_locus_id, disorder_id, rel_id)
assoc.add_evidence(eco_id)
assoc.add_association_to_graph(g)
rlist = a.find("./Gene/ExternalReferenceList")
eqid = None
for r in rlist.findall("ExternalReference"):
if r.find("Source").text == "Ensembl":
eqid = "ENSEMBL:" + r.find("Reference").text
elif r.find("Source").text == "HGNC":
eqid = "HGNC:" + r.find("Reference").text
elif r.find("Source").text == "OMIM":
eqid = "OMIM:" + r.find("Reference").text
else:
pass # skip the others for now
if eqid is not None:
gu.addClassToGraph(g, eqid, None)
gu.addEquivalentClass(g, gene_id, eqid)
pass
elem.clear() # discard the element
if self.testMode and limit is not None and line_counter > limit:
return
gu.loadProperties(g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
gu.loadAllProperties(g)
return
示例8: _get_gene_info
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addEquivalentClass [as 别名]
def _get_gene_info(self, limit):
"""
Currently loops through the gene_info file and creates the genes as classes, typed with SO. It will add their
label, any alternate labels as synonyms, alternate ids as equivlaent classes. HPRDs get added as
protein products. The chromosome and chr band get added as blank node regions, and the gene is faldo:located
on the chr band.
:param limit:
:return:
"""
gu = GraphUtils(curie_map.get())
if self.testMode:
g = self.testgraph
else:
g = self.graph
geno = Genotype(g)
# not unzipping the file
logger.info("Processing Gene records")
line_counter = 0
myfile = '/'.join((self.rawdir, self.files['gene_info']['file']))
logger.info("FILE: %s", myfile)
# Add taxa and genome classes for those in our filter
for tax_num in self.tax_ids:
tax_id = ':'.join(('NCBITaxon', str(tax_num)))
geno.addGenome(tax_id, str(tax_num)) # tax label can get added elsewhere
gu.addClassToGraph(g, tax_id, None) # label added elsewhere
with gzip.open(myfile, 'rb') as f:
for line in f:
# skip comments
line = line.decode().strip()
if re.match('^#', line):
continue
(tax_num, gene_num, symbol, locustag,
synonyms, xrefs, chr, map_loc, desc,
gtype, authority_symbol, name,
nomenclature_status, other_designations, modification_date) = line.split('\t')
##### set filter=None in init if you don't want to have a filter
#if self.filter is not None:
# if ((self.filter == 'taxids' and (int(tax_num) not in self.tax_ids))
# or (self.filter == 'geneids' and (int(gene_num) not in self.gene_ids))):
# continue
##### end filter
if self.testMode and int(gene_num) not in self.gene_ids:
continue
if int(tax_num) not in self.tax_ids:
continue
line_counter += 1
gene_id = ':'.join(('NCBIGene', gene_num))
tax_id = ':'.join(('NCBITaxon', tax_num))
gene_type_id = self._map_type_of_gene(gtype)
if symbol == 'NEWENTRY':
label = None
else:
label = symbol
# TODO might have to figure out if things aren't genes, and make them individuals
gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)
# we have to do special things here for genes, because they're classes not individuals
# f = Feature(gene_id,label,gene_type_id,desc)
if name != '-':
gu.addSynonym(g, gene_id, name)
if synonyms.strip() != '-':
for s in synonyms.split('|'):
gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])
if other_designations.strip() != '-':
for s in other_designations.split('|'):
gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])
# deal with the xrefs
# MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
if xrefs.strip() != '-':
for r in xrefs.strip().split('|'):
fixedr = self._cleanup_id(r)
if fixedr is not None and fixedr.strip() != '':
if re.match('HPRD', fixedr):
# proteins are not == genes.
gu.addTriple(g, gene_id, self.properties['has_gene_product'], fixedr)
else:
# skip some of these for now
if fixedr.split(':')[0] not in ['Vega', 'IMGT/GENE-DB']:
gu.addEquivalentClass(g, gene_id, fixedr)
# edge cases of id | symbol | chr | map_loc:
# 263 AMD1P2 X|Y with Xq28 and Yq12
# 438 ASMT X|Y with Xp22.3 or Yp11.3 # in PAR
# 419 ART3 4 with 4q21.1|4p15.1-p14 # no idea why there's two bands listed - possibly 2 assemblies
# 28227 PPP2R3B X|Y Xp22.33; Yp11.3 # in PAR
# 619538 OMS 10|19|3 10q26.3;19q13.42-q13.43;3p25.3 #this is of "unknown" type == susceptibility
# 101928066 LOC101928066 1|Un - # unlocated scaffold
#.........这里部分代码省略.........
示例9: _get_identifiers
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addEquivalentClass [as 别名]
def _get_identifiers(self, limit):
    """
    Process the identifier mapping file provided by BioGRID.

    The zip archive registered under ``self.files['identifiers']`` is
    opened and its first member is read. Everything up to and including
    the line that starts with ``BIOGRID_ID`` is treated as header and
    skipped. Each following line has four tab-separated columns:

        BIOGRID_ID, IDENTIFIER_VALUE, IDENTIFIER_TYPE,
        ORGANISM_OFFICIAL_NAME
        e.g. 1  814566  ENTREZ_GENE  Arabidopsis thaliana

    For rows of an accepted species, identifiers whose mapped prefix is an
    accepted gene id type become equivalent classes of the BioGRID id, and
    OFFICIAL_SYMBOL rows provide the label of the BioGRID class.

    :param limit: optional cap on the number of accepted-species rows;
        ignored in test mode, where rows are filtered by
        ``self.biogrid_ids``
    :return: None
    """
    logger.info("getting identifier mapping")
    line_counter = 0
    f = '/'.join((self.rawdir, self.files['identifiers']['file']))
    foundheader = False
    gu = GraphUtils(curie_map.get())
    # The target graph does not change per line, so pick it once.
    g = self.testgraph if self.testMode else self.graph

    # TODO align this species filter with the one above
    # speciesfilters = 'Homo sapiens,Mus musculus,Drosophila melanogaster,
    # Danio rerio, Caenorhabditis elegans,Xenopus laevis'.split(',')
    speciesfilters = 'Homo sapiens,Mus musculus'.split(',')
    # TODO make these filters available as commandline options
    # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
    # WormBase,XenBase,ENSEMBL,miRBase'.split(',')
    geneidtypefilters = 'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC'.split(',')
    # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'

    # Context managers guarantee the archive is closed even when a row
    # fails to parse; previously an exception leaked the open ZipFile.
    with ZipFile(f, 'r') as myzip:
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        with myzip.open(fname, 'r') as csvfile:
            for raw_line in csvfile:
                line = raw_line.decode()

                # skip header lines, including the column-name line itself
                if not foundheader:
                    if re.match(r'BIOGRID_ID', line):
                        foundheader = True
                    continue

                (biogrid_num, id_num, id_type,
                 organism_label) = line.strip().split('\t')

                # in test mode, skip any genes outside the test set
                if self.testMode \
                        and int(biogrid_num) not in self.biogrid_ids:
                    continue

                # for each one of these,
                # create the node and add equivalent classes
                biogrid_id = 'BIOGRID:'+biogrid_num
                prefix = self._map_idtype_to_prefix(id_type)

                if organism_label.strip() in speciesfilters:
                    line_counter += 1
                    if prefix in geneidtypefilters:
                        mapped_id = ':'.join((prefix, id_num))
                        gu.addEquivalentClass(g, biogrid_id, mapped_id)
                    # this symbol will only get attached to the biogrid class
                    elif id_type == 'OFFICIAL_SYMBOL':
                        gu.addClassToGraph(g, biogrid_id, id_num)
                    # FIXME - SYNONYM rows are not handled; it is unclear
                    # whether they are synonyms or alternate ids.

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break
    return
示例10: _process_omim2disease
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addEquivalentClass [as 别名]
def _process_omim2disease(self, limit=None):
    """
    Declare KEGG diseases equivalent to OMIM diseases for 1:1 mappings.

    First the whole three-column (OMIM id, KEGG id, link type) file is
    read and both lookup tables, ``self.omim_disease_hash`` (OMIM -> KEGG
    ids) and ``self.kegg_disease_hash`` (KEGG -> OMIM ids), are filled.
    Then, for every OMIM id that maps to exactly one KEGG id which in turn
    maps to exactly one OMIM id:

        <kegg_disease_id> is a class
        <omim_disease_id> is a class
        <kegg_disease_id> equivalentClass <omim_disease_id>

    :param limit: optional cap on the number of OMIM ids examined in the
        second pass; ignored in test mode, where ids are filtered by
        ``self.test_ids['disease']``
    :return: None
    """
    logger.info("Processing 1:1 KEGG disease to OMIM disease mappings")
    g = self.testgraph if self.testMode else self.graph
    line_counter = 0
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['omim2disease']['file']))

    # Pass 1: collect the links in both directions.
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            (omim_disease_id, kegg_disease_id, link_type) = row
            kegg_disease_id = 'KEGG-'+kegg_disease_id.strip()
            omim_disease_id = omim_disease_id.replace('omim', 'OMIM')

            # OMIM ID -> KEGG IDs
            if omim_disease_id in self.omim_disease_hash:
                self.omim_disease_hash[omim_disease_id].append(kegg_disease_id)
            else:
                self.omim_disease_hash[omim_disease_id] = [kegg_disease_id]

            # KEGG ID -> OMIM IDs
            if kegg_disease_id in self.kegg_disease_hash:
                self.kegg_disease_hash[kegg_disease_id].append(omim_disease_id)
            else:
                self.kegg_disease_hash[kegg_disease_id] = [omim_disease_id]

    # Pass 2: keep only pairs that are unique in both directions.
    for omim_disease_id in self.omim_disease_hash:
        if self.testMode:
            if omim_disease_id not in self.test_ids['disease']:
                continue
        elif limit is not None and line_counter > limit:
            break
        line_counter += 1

        kegg_candidates = self.omim_disease_hash[omim_disease_id]
        if len(kegg_candidates) != 1:
            continue
        kegg_disease_id = ''.join(self.omim_disease_hash.get(omim_disease_id))
        if len(self.kegg_disease_hash[kegg_disease_id]) != 1:
            continue

        # add ids, and deal with the labels separately
        gu.addClassToGraph(g, kegg_disease_id, None)
        gu.addClassToGraph(g, omim_disease_id, None)
        # An xref would be the weaker alternative; "safe?" was the
        # original author's open question about asserting equivalence.
        gu.addEquivalentClass(g, kegg_disease_id, omim_disease_id)

    logger.info("Done with KEGG disease to OMIM disease mappings.")
    return
示例11: _process_omim2gene
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addEquivalentClass [as 别名]
def _process_omim2gene(self, limit=None):
    """
    Map OMIM ids to KEGG gene ids, branching on the row's link type.

    Rows come from the three-column (KEGG gene id, OMIM id, link type)
    file registered under ``self.files['omim2gene']``.

    * ``equivalent``: both ids denote genes; the OMIM id is added as a
      class, the KEGG id as a gene, and the two are declared equivalent.
    * ``reverse``: the OMIM id is treated as a disease; a variant locus of
      the KEGG gene is created and associated with the disease via
      ``is_marker_for``.
    * ``original``: skipped (these are sometimes a gene and sometimes a
      disease); logged at info level.
    * anything else: logged as a warning.

    Triples created:
        <omim_gene_id> is a class
        <kegg_gene_id> is a Gene
        <kegg_gene_id> equivalentClass <omim_gene_id>
        <variant_locus_id> is an allele of <kegg_gene_id>
        association of <variant_locus_id> with <omim_disease_id>

    :param limit: optional cap on the number of rows read; ignored in
        test mode, where rows are filtered by ``self.test_ids['genes']``
    :return: None
    """
    logger.info("Processing OMIM to KEGG gene")
    g = self.testgraph if self.testMode else self.graph
    line_counter = 0
    geno = Genotype(g)
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['omim2gene']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (kegg_gene_id, omim_id, link_type) = row

            # The test-set filter applies to the unprefixed KEGG id.
            if self.testMode and kegg_gene_id not in self.test_ids['genes']:
                continue

            kegg_gene_id = 'KEGG-'+kegg_gene_id.strip()
            omim_id = re.sub('omim', 'OMIM', omim_id)

            if link_type == 'equivalent':
                # these are genes! so add them as a class then make
                # equivalence
                gu.addClassToGraph(g, omim_id, None)
                geno.addGene(kegg_gene_id, None)
                gu.addEquivalentClass(g, kegg_gene_id, omim_id)
            elif link_type == 'reverse':
                # make an association between an OMIM ID and the KEGG gene
                # ID; we do this with omim ids because they are more atomic
                # than KEGG ids
                alt_locus_id = self._make_variant_locus_id(
                    kegg_gene_id, omim_id)
                # NOTE(review): raises KeyError when no label was recorded
                # for this locus id -- confirm label_hash is always filled
                # before this method runs.
                alt_label = self.label_hash[alt_locus_id]
                gu.addIndividualToGraph(
                    g, alt_locus_id, alt_label,
                    geno.genoparts['variant_locus'])
                geno.addAlleleOfGene(alt_locus_id, kegg_gene_id)

                # Add the disease to gene relationship.
                rel = gu.object_properties['is_marker_for']
                assoc = G2PAssoc(self.name, alt_locus_id, omim_id, rel)
                assoc.add_association_to_graph(g)
            elif link_type == 'original':
                # these are sometimes a gene, and sometimes a disease
                logger.info(
                    'Unable to handle original link for %s-%s',
                    kegg_gene_id, omim_id)
            else:
                # don't know what these are
                # logger.warn is a deprecated alias; use logger.warning
                logger.warning(
                    'Unhandled link type for %s-%s: %s',
                    kegg_gene_id, omim_id, link_type)

            if (not self.testMode) and (
                    limit is not None and line_counter > limit):
                break

    logger.info("Done with OMIM to KEGG gene")
    gu.loadProperties(g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
    gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
    gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
    return
示例12: _process_genes
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addEquivalentClass [as 别名]
def _process_genes(self, limit=None):
gu = GraphUtils(curie_map.get())
if self.testMode:
g = self.testgraph
else:
g = self.graph
geno = Genotype(g)
raw = '/'.join((self.rawdir, self.files['genes']['file']))
line_counter = 0
logger.info("Processing HGNC genes")
with open(raw, 'r', encoding="utf8") as csvfile:
filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
for row in filereader:
(hgnc_id, symbol, name, locus_group, locus_type, status,
location, location_sortable, alias_symbol, alias_name,
prev_symbol, prev_name, gene_family, gene_family_id,
date_approved_reserved, date_symbol_changed,
date_name_changed, date_modified, entrez_id, ensembl_gene_id,
vega_id, ucsc_id, ena, refseq_accession, ccds_id, uniprot_ids,
pubmed_id, mgd_id, rgd_id, lsdb, cosmic, omim_id, mirbase,
homeodb, snornabase, bioparadigms_slc, orphanet,
pseudogene_org, horde_id, merops, imgt, iuphar,
kznf_gene_catalog, mamit_trnadb, cd, lncrnadb, enzyme_id,
intermediate_filament_db) = row
line_counter += 1
# skip header
if line_counter <= 1:
continue
if self.testMode and entrez_id != '' \
and int(entrez_id) not in self.gene_ids:
continue
if name == '':
name = None
gene_type_id = self._get_gene_type(locus_type)
gu.addClassToGraph(g, hgnc_id, symbol, gene_type_id, name)
if locus_type == 'withdrawn':
gu.addDeprecatedClass(g, hgnc_id)
if entrez_id != '':
gu.addEquivalentClass(
g, hgnc_id, 'NCBIGene:' + entrez_id)
if ensembl_gene_id != '':
gu.addEquivalentClass(
g, hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
geno.addTaxon('NCBITaxon:9606', hgnc_id)
# add pubs as "is about"
if pubmed_id != '':
for p in re.split(r'\|', pubmed_id.strip()):
if str(p) != '':
gu.addTriple(
g, 'PMID:' + str(p.strip()),
gu.object_properties['is_about'], hgnc_id)
# add chr location
# sometimes two are listed, like: 10p11.2 or 17q25
# -- there are only 2 of these FRA10A and MPFD
# sometimes listed like "1 not on reference assembly"
# sometimes listed like 10q24.1-q24.3
# sometimes like 11q11 alternate reference locus
band = chrom = None
chr_pattern = r'(\d+|X|Y|Z|W|MT)[pq$]'
chr_match = re.match(chr_pattern, location)
if chr_match is not None and len(chr_match.groups()) > 0:
chrom = chr_match.group(1)
chrom_id = makeChromID(chrom, 'NCBITaxon:9606', 'CHR')
band_pattern = r'([pq][A-H\d]?\d?(?:\.\d+)?)'
band_match = re.search(band_pattern, location)
f = Feature(hgnc_id, None, None)
if band_match is not None and len(band_match.groups()) > 0:
band = band_match.group(1)
band = chrom + band
# add the chr band as the parent to this gene
# as a feature but assume that the band is created
# as a class with properties elsewhere in Monochrom
# TEC Monoch? Monarchdom??
band_id = makeChromID(band, 'NCBITaxon:9606', 'CHR')
gu.addClassToGraph(g, band_id, None)
f.addSubsequenceOfFeature(g, band_id)
else:
gu.addClassToGraph(g, chrom_id, None)
f.addSubsequenceOfFeature(g, chrom_id)
if not self.testMode \
and limit is not None and line_counter > limit:
break
# end loop through file
gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
gu.loadAllProperties(g)
#.........这里部分代码省略.........
示例13: _process_trait_mappings
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addEquivalentClass [as 别名]
def _process_trait_mappings(self, raw, limit=None):
    """
    Load the trait mapping CSV and relate each ATO trait to VT/PT/CMO terms.

    The first row of the file is skipped as a header. Each remaining row
    must have eight columns (VT id, PT id, CMO id, ATO column, species,
    trait class, trait type, QTL count); rows with fewer columns are
    logged and skipped. The ATO column holds a bracketed id followed by a
    label: the bracketed part becomes the trait id with ``ATO #`` replaced
    by ``AQTLTrait:``, the rest becomes the label.

    Triples created:
        <ato_id> is a class, labelled with the ATO label
        <ato_id> equivalentClass <vto_id>   (when the VT column starts
                                             with "VT:")
        <ato_id> equivalentClass <pto_id>   (when the PT column starts
                                             with "PT:")
        <ato_id> hasXref <cmo_id>           (when the CMO column starts
                                             with "CMO:")

    :param raw: path of the comma-delimited trait mapping file
    :param limit: accepted for interface compatibility; this method does
        not apply it
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    line_counter = 0
    gu = GraphUtils(curie_map.get())

    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip header line
        for row in filereader:
            line_counter += 1
            # need to skip the last line
            if len(row) < 8:
                logger.info(
                    "skipping line %d: %s", line_counter, '\t'.join(row))
                continue

            (vto_id, pto_id, cmo_id, ato_column, species,
             trait_class, trait_type, qtl_count) = row

            # Raw strings keep the regex escapes (\[ \] \s) out of the
            # string-literal layer, where they are invalid escapes.
            # Drop every '[', cut from the first ']' onward, then swap
            # the prefix.
            ato_id = re.sub(r'\[', '', ato_column)
            ato_id = re.sub(r'\].*', '', ato_id)
            ato_id = re.sub('ATO #', 'AQTLTrait:', ato_id)
            # Label: everything after the last ']' and following spaces.
            ato_label = re.sub(r'.*\]\s*', '', ato_column)

            gu.addClassToGraph(g, ato_id, ato_label.strip())

            if re.match('VT:.*', vto_id):
                gu.addClassToGraph(g, vto_id, None)
                gu.addEquivalentClass(g, ato_id, vto_id)
            if re.match('PT:.*', pto_id):
                gu.addClassToGraph(g, pto_id, None)
                gu.addEquivalentClass(g, ato_id, pto_id)
            if re.match('CMO:.*', cmo_id):
                gu.addClassToGraph(g, cmo_id, None)
                gu.addXref(g, ato_id, cmo_id)

    logger.info("Done with trait mappings")
    return
示例14: _process_diseasegene
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addEquivalentClass [as 别名]
#.........这里部分代码省略.........
gene_iid_to_type = {}
gene_list = elem.find('GeneList')
for gene in gene_list.findall('Gene'):
gene_iid = gene.get('id')
gene_type = gene.find('GeneType').get('id')
gene_iid_to_type[gene_iid] = gene_type
# assuming that these are in the ontology
gu.addClassToGraph(g, disorder_id, disorder_label)
assoc_list = elem.find('DisorderGeneAssociationList')
for a in assoc_list.findall('DisorderGeneAssociation'):
gene_iid = a.find('.//Gene').get('id')
gene_name = a.find('.//Gene/Name').text
gene_symbol = a.find('.//Gene/Symbol').text
gene_num = a.find('./Gene/OrphaNumber').text
gene_id = 'Orphanet:'+str(gene_num)
gene_type_id = \
self._map_gene_type_id(gene_iid_to_type[gene_iid])
gu.addClassToGraph(
g, gene_id, gene_symbol, gene_type_id, gene_name)
syn_list = a.find('./Gene/SynonymList')
if int(syn_list.get('count')) > 0:
for s in syn_list.findall('./Synonym'):
gu.addSynonym(g, gene_id, s.text)
dgtype = a.find('DisorderGeneAssociationType').get('id')
rel_id = self._map_rel_id(dgtype)
dg_label = \
a.find('./DisorderGeneAssociationType/Name').text
if rel_id is None:
logger.warning(
"Cannot map association type (%s) to RO " +
"for association (%s | %s). Skipping.",
dg_label, disorder_label, gene_symbol)
continue
alt_locus_id = '_'+gene_num+'-'+disorder_num+'VL'
alt_label = \
' '.join(('some variant of', gene_symbol.strip(),
'that is a', dg_label.lower(),
disorder_label))
if self.nobnodes:
alt_locus_id = ':'+alt_locus_id
gu.addIndividualToGraph(g, alt_locus_id, alt_label,
geno.genoparts['variant_locus'])
geno.addAlleleOfGene(alt_locus_id, gene_id)
# consider typing the gain/loss-of-function variants like:
# http://sequenceontology.org/browser/current_svn/term/SO:0002054
# http://sequenceontology.org/browser/current_svn/term/SO:0002053
# use "assessed" status to issue an evidence code
# FIXME I think that these codes are sub-optimal
status_code = \
a.find('DisorderGeneAssociationStatus').get('id')
# imported automatically asserted information
# used in automatic assertion
eco_id = 'ECO:0000323'
# Assessed
# TODO are these internal ids stable between releases?
if status_code == '17991':
# imported manually asserted information
# used in automatic assertion
eco_id = 'ECO:0000322'
# Non-traceable author statement ECO_0000034
# imported information in automatic assertion ECO_0000313
assoc = G2PAssoc(self.name, alt_locus_id,
disorder_id, rel_id)
assoc.add_evidence(eco_id)
assoc.add_association_to_graph(g)
rlist = a.find('./Gene/ExternalReferenceList')
eqid = None
for r in rlist.findall('ExternalReference'):
if r.find('Source').text == 'Ensembl':
eqid = 'ENSEMBL:'+r.find('Reference').text
elif r.find('Source').text == 'HGNC':
eqid = 'HGNC:'+r.find('Reference').text
elif r.find('Source').text == 'OMIM':
eqid = 'OMIM:'+r.find('Reference').text
else:
pass # skip the others for now
if eqid is not None:
gu.addClassToGraph(g, eqid, None)
gu.addEquivalentClass(g, gene_id, eqid)
elem.clear() # discard the element
if self.testMode and limit is not None and line_counter > limit:
return
gu.loadProperties(
g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
gu.loadAllProperties(g)
return