本文整理汇总了Python中dipper.models.Model.Model.addEquivalentClass方法的典型用法代码示例。如果您正苦于以下问题:Python Model.addEquivalentClass方法的具体用法?Python Model.addEquivalentClass怎么用?Python Model.addEquivalentClass使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类dipper.models.Model.Model的用法示例。
在下文中一共展示了Model.addEquivalentClass方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _map_eom_terms
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addEquivalentClass [as 别名]
def _map_eom_terms(self, raw, limit=None):
    """
    Read the local tsv file that maps morphology terms to HP ids.

    The first line of the file is discarded as a header.  Every other line
    must split into exactly five tab-separated fields, otherwise the tuple
    unpacking raises ``ValueError``.  Underscores in the HP column are
    replaced by colons; when the result contains ``HP:`` the HP id is added
    as a class and declared equivalent to the morphology term:

        <eom id> owl:equivalentClass <hp id>

    Rows without an HP id are reported with a warning.

    :param raw: path of the tsv file to read
    :param limit: optional cap; reading stops once the number of data rows
        read exceeds it
    :return: None
    """
    model = Model(self.graph)
    with open(raw, 'r') as tsv:
        tsv.readline()  # the header row carries no data
        for rows_seen, record in enumerate(tsv, start=1):
            (eom_id, eom_label, hp_curie, _hp_label, _notes) = \
                record.split('\t')
            # The file writes HP ids with an underscore; curies use a colon.
            hp_curie = hp_curie.replace('_', ':')
            if re.match(".*HP:.*", hp_curie) is None:
                LOG.warning('No matching HP term for %s', eom_label)
            else:
                # register the HP term, then tie the morphology term to it
                model.addClassToGraph(hp_curie, None)
                model.addEquivalentClass(eom_id, hp_curie)
            if limit is not None and rows_seen > limit:
                break
示例2: _add_gene_equivalencies
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addEquivalentClass [as 别名]
def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
    """
    Add equivalentClass and sameAs relationships for a gene's xrefs.

    Each xref is split at its last colon into a prefix and a local id, and
    the prefix is translated through ``self.localtt`` when it has an entry
    there.  Xrefs with an empty prefix are ignored.  Otherwise:

    * ``HPRD`` xrefs are recorded as gene products (proteins are not genes);
    * ``Vega``, ``IMGT/GENE-DB`` and ``Araport`` xrefs are skipped;
    * ``ENSEMBL`` xrefs additionally get a plain xref;
    * ``OMIM`` xrefs are first swapped for a replacement listed in
      ``self.omim_replaced`` whose type is gene, then skipped when
      ``self.omim_type`` records a type other than gene;
    * everything left is declared equivalent to ``gene_id``: as an
      equivalent class when ``self.class_or_indiv`` marks the gene ``'C'``,
      otherwise as the same individual.

    The map parsed from ``self.resources['clique_leader']`` decides, per
    taxon, which member of an equivalent-class pair is made the leader.

    :param xrefs: pipe-delimited xrefs, e.g.
        MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479
    :param gene_id: curie of the gene the xrefs belong to
    :param taxon: key looked up in the clique leader map
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])
    model = Model(self.testgraph if self.test_mode else self.graph)
    skipped_prefixes = ['Vega', 'IMGT/GENE-DB', 'Araport']

    for xref in xrefs.strip().split('|'):
        # everything before the last colon is the prefix, the rest the id
        head, _, local_id = xref.rpartition(':')
        prefix = head.strip()
        if prefix in self.localtt:
            prefix = self.localtt[prefix]
        if prefix == '':
            continue
        curie = ':'.join((prefix, local_id))

        if prefix == 'HPRD':
            # proteins are not == genes.
            model.addTriple(
                gene_id, self.globaltt['has gene product'], curie)
            continue
        # skip some of these for now based on curie prefix
        if prefix in skipped_prefixes:
            continue
        if prefix == 'ENSEMBL':
            model.addXref(gene_id, curie)
        if prefix == 'OMIM':
            if curie in self.omim_replaced:
                # the last replacement typed as gene wins
                for candidate in self.omim_replaced[curie]:
                    if candidate in self.omim_type and \
                            self.omim_type[candidate] == self.globaltt['gene']:
                        curie = candidate
            if curie in self.omim_type and \
                    self.omim_type[curie] != self.globaltt['gene']:
                continue

        try:
            if self.class_or_indiv.get(gene_id) != 'C':
                model.addSameIndividual(gene_id, curie)
            else:
                model.addEquivalentClass(gene_id, curie)
                if taxon in clique_map:
                    leader_prefix = clique_map[taxon]
                    if leader_prefix == prefix:
                        model.makeLeader(curie)
                    elif leader_prefix == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
        except AssertionError as err:
            LOG.warning("Error parsing %s: %s", gene_id, err)
示例3: _add_gene_equivalencies
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addEquivalentClass [as 别名]
def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
    """
    Add equivalentClass and sameAs relationships for a gene's xrefs.

    Uses the external resource map referenced by
    ``self.resources['clique_leader']`` to determine whether an ID space
    is a clique leader for the given taxon.

    Every pipe-delimited xref is normalised with ``self._cleanup_id``;
    empty results are ignored.  HPRD xrefs become gene products, xrefs
    whose prefix is filtered out (or that are OMIM diseases) are skipped,
    and the rest are declared equivalent to ``gene_id`` -- as an equivalent
    class when ``self.class_or_indiv`` marks the gene ``'C'``, otherwise as
    the same individual.

    :param xrefs: pipe-delimited xrefs, e.g.
        MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479
    :param gene_id: curie of the gene the xrefs belong to
    :param taxon: taxon number as a string; it is compared as a string
        against the taxon-specific filters and converted with ``int`` for
        the clique leader lookup
    :return: None
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])
    if self.testMode:
        graph = self.testgraph
    else:
        graph = self.graph

    filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport']
    # prefixes that are additionally ignored for particular taxa
    taxon_spec_filters = {
        '10090': ['ENSEMBL']
    }
    if taxon in taxon_spec_filters:
        filter_out += taxon_spec_filters[taxon]

    model = Model(graph)

    for ref in xrefs.strip().split('|'):
        xref_curie = self._cleanup_id(ref)
        if xref_curie is None or xref_curie.strip() == '':
            continue
        if re.match(r'HPRD', xref_curie):
            # proteins are not == genes.
            model.addTriple(
                gene_id, self.properties['has_gene_product'], xref_curie)
            continue
        # skip some of these for now
        xref_prefix = xref_curie.split(':')[0]
        if xref_prefix in filter_out:
            continue
        if re.match(r'^OMIM', xref_curie) \
                and DipperUtil.is_omim_disease(xref_curie):
            continue
        try:
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addEquivalentClass(gene_id, xref_curie)
                # the clique map is keyed by integer taxon numbers
                taxon_key = int(taxon)
                if taxon_key in clique_map:
                    if clique_map[taxon_key] == xref_prefix:
                        model.makeLeader(xref_curie)
                    elif clique_map[taxon_key] == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
            else:
                model.addSameIndividual(gene_id, xref_curie)
        except AssertionError as e:
            # Logger.warn is a deprecated alias; use warning() and let the
            # logging module interpolate the arguments.
            logger.warning("Error parsing %s: %s", gene_id, e)
    return
示例4: _process_trait_mappings
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addEquivalentClass [as 别名]
def _process_trait_mappings(self, raw, limit=None):
    """
    Load trait mappings from a comma-separated file into the graph.

    The first row is skipped as a header and rows with fewer than eight
    columns are logged and skipped.  For each remaining row the ATO column
    (``[ATO #<id>] <label>``) is turned into an ``AQTLTrait:`` id plus
    label and added as a class.  Then:

    * a ``VT:`` id is added as a class and made equivalent to the ATO id;
    * an ``LPT:`` id is added as a class and attached as an xref;
    * a ``CMO:`` id is added as a class and attached as an xref.

    :param raw: path of the csv file to read
    :param limit: accepted but not consulted by this method
    :return: None
    """
    model = Model(self.testgraph if self.test_mode else self.graph)

    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip header line
        for line_counter, row in enumerate(filereader, start=1):
            # need to skip the last line
            if len(row) < 8:
                LOG.info(
                    "skipping line %d: %s", line_counter, '\t'.join(row))
                continue
            (vto_id, pto_id, cmo_id, ato_column, _species, _trait_class,
             _trait_type, _qtl_count) = row

            # peel "[ATO #123] label" down to "AQTLTrait:123"
            ato_id = re.sub(r'\[', '', ato_column)
            ato_id = re.sub(r'\].*', '', ato_id)
            ato_id = re.sub(r'ATO #', 'AQTLTrait:', ato_id).strip()
            # whatever follows the closing bracket is the label
            ato_label = re.sub(r'.*\]\s*', '', ato_column).strip()
            model.addClassToGraph(ato_id, ato_label)

            if re.match(r'VT:.*', vto_id):
                model.addClassToGraph(vto_id, None)
                model.addEquivalentClass(ato_id, vto_id)
            if re.match(r'LPT:.*', pto_id):
                model.addClassToGraph(pto_id, None)
                model.addXref(ato_id, pto_id)
            if re.match(r'CMO:.*', cmo_id):
                model.addClassToGraph(cmo_id, None)
                model.addXref(ato_id, cmo_id)

    LOG.info("Done with trait mappings")
示例5: _get_mapped_gene_ids
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addEquivalentClass [as 别名]
def _get_mapped_gene_ids(self, entry, g):
    """
    Return the gene ids listed in an entry's external links.

    The comma-separated ``geneIDs`` value under ``externalLinks`` is split
    and stored in ``self.omim_ncbigene_idmap`` under the entry's OMIM
    curie.  When the entry's type (from ``self._get_omimtype``) is gene,
    each id is also declared an equivalent class of the OMIM curie, as
    ``NCBIGene:<id>``.

    :param entry: mapping with a ``mimNumber`` key and, optionally,
        ``externalLinks``
    :param g: graph handed to ``Model``
    :return: list of gene id strings; empty when the entry has no
        ``externalLinks`` or no ``geneIDs``
    """
    model = Model(g)
    omim_curie = 'OMIM:' + str(entry['mimNumber'])

    if 'externalLinks' not in entry:
        return []
    external_links = entry['externalLinks']
    omimtype = self._get_omimtype(entry)
    if 'geneIDs' not in external_links:
        return []

    gene_ids = external_links['geneIDs'].split(',')
    self.omim_ncbigene_idmap[omim_curie] = gene_ids
    if omimtype == Genotype.genoparts['gene']:
        for entrez_num in gene_ids:
            model.addEquivalentClass(
                omim_curie, 'NCBIGene:' + str(entrez_num))
    return gene_ids
示例6: _process_genes_kegg2ncbi
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addEquivalentClass [as 别名]
def _process_genes_kegg2ncbi(self, limit=None):
    """
    Map KEGG gene IDs to the corresponding NCBI Gene IDs.

    Reads the tab-separated file registered under ``self.files['ncbi']``;
    every row must hold exactly three columns.

    Triples created:
    <kegg_gene_id> is a class
    <ncbi_gene_id> is a class
    <kegg_gene_id> equivalentClass <ncbi_gene_id>

    :param limit: outside test mode, stop once the reader's line number
        exceeds this value
    :return: None
    """
    LOG.info("Processing KEGG gene IDs to NCBI gene IDs")
    model = Model(self.testgraph if self.test_mode else self.graph)
    raw = '/'.join((self.rawdir, self.files['ncbi']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for (kegg_num, ncbi_raw, _link_type) in reader:
            # the test id list holds the un-prefixed KEGG ids
            if self.test_mode and kegg_num not in self.test_ids['genes']:
                continue

            # Adjust the NCBI gene ID prefix.
            ncbi_curie = ncbi_raw.replace('ncbi-geneid', 'NCBIGene')
            kegg_curie = 'KEGG-' + kegg_num

            # Adding the KEGG gene ID to the graph here is redundant,
            # unless there happens to be additional gene IDs in this table
            # not present in the genes table.
            model.addClassToGraph(kegg_curie, None)
            model.addClassToGraph(ncbi_curie, None)
            model.addEquivalentClass(kegg_curie, ncbi_curie)

            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break

    LOG.info("Done with KEGG gene IDs to NCBI gene IDs")
示例7: _process_diseasegene
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addEquivalentClass [as 别名]
#.........这里部分代码省略.........
config.get_config()['test_ids']['disease']:
continue
disorder_label = elem.find('Name').text
# make a hash of internal gene id to type for later lookup
gene_iid_to_type = {}
gene_list = elem.find('GeneList')
for gene in gene_list.findall('Gene'):
gene_iid = gene.get('id')
gene_type = gene.find('GeneType').get('id')
gene_iid_to_type[gene_iid] = gene_type
# assuming that these are in the ontology
model.addClassToGraph(disorder_id, disorder_label)
assoc_list = elem.find('DisorderGeneAssociationList')
for a in assoc_list.findall('DisorderGeneAssociation'):
gene_iid = a.find('.//Gene').get('id')
gene_name = a.find('.//Gene/Name').text
gene_symbol = a.find('.//Gene/Symbol').text
gene_num = a.find('./Gene/OrphaNumber').text
gene_id = 'Orphanet:'+str(gene_num)
gene_type_id = \
self._map_gene_type_id(gene_iid_to_type[gene_iid])
model.addClassToGraph(
gene_id, gene_symbol, gene_type_id, gene_name)
syn_list = a.find('./Gene/SynonymList')
if int(syn_list.get('count')) > 0:
for s in syn_list.findall('./Synonym'):
model.addSynonym(gene_id, s.text)
dgtype = a.find('DisorderGeneAssociationType').get('id')
rel_id = self._map_rel_id(dgtype)
dg_label = \
a.find('./DisorderGeneAssociationType/Name').text
if rel_id is None:
logger.warning(
"Cannot map association type (%s) to RO " +
"for association (%s | %s). Skipping.",
dg_label, disorder_label, gene_symbol)
continue
alt_locus_id = '_:'+gene_num+'-'+disorder_num+'VL'
alt_label = \
' '.join(('some variant of', gene_symbol.strip(),
'that is a', dg_label.lower(),
disorder_label))
model.addIndividualToGraph(alt_locus_id, alt_label,
geno.genoparts['variant_locus'])
geno.addAffectedLocus(alt_locus_id, gene_id)
model.addBlankNodeAnnotation(alt_locus_id)
# consider typing the gain/loss-of-function variants like:
# http://sequenceontology.org/browser/current_svn/term/SO:0002054
# http://sequenceontology.org/browser/current_svn/term/SO:0002053
# use "assessed" status to issue an evidence code
# FIXME I think that these codes are sub-optimal
status_code = \
a.find('DisorderGeneAssociationStatus').get('id')
# imported automatically asserted information
# used in automatic assertion
eco_id = 'ECO:0000323'
# Assessed
# TODO are these internal ids stable between releases?
if status_code == '17991':
# imported manually asserted information
# used in automatic assertion
eco_id = 'ECO:0000322'
# Non-traceable author statement ECO_0000034
# imported information in automatic assertion ECO_0000313
assoc = G2PAssoc(g, self.name, alt_locus_id,
disorder_id, rel_id)
assoc.add_evidence(eco_id)
assoc.add_association_to_graph()
rlist = a.find('./Gene/ExternalReferenceList')
eqid = None
for r in rlist.findall('ExternalReference'):
if r.find('Source').text == 'Ensembl':
eqid = 'ENSEMBL:'+r.find('Reference').text
elif r.find('Source').text == 'HGNC':
eqid = 'HGNC:'+r.find('Reference').text
elif r.find('Source').text == 'OMIM':
eqid = 'OMIM:'+r.find('Reference').text
else:
pass # skip the others for now
if eqid is not None:
model.addClassToGraph(eqid, None)
model.addEquivalentClass(gene_id, eqid)
elem.clear() # empty the element
if self.testMode and limit is not None and line_counter > limit:
return
return
示例8: _process_diseasegene
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addEquivalentClass [as 别名]
def _process_diseasegene(self, limit):
    """
    Parse the disease-gene XML file and add its content to the graph.

    For every ``Disorder`` element the disorder is added as a class
    (``ORPHA:<OrphaNumber>``).  For each of its gene associations:

    * the gene curie defaults to ``ORPHA:<OrphaNumber>`` but is replaced by
      the first available external reference among HGNC, Ensembl and
      SwissProt (the clique leader);
    * every other identifier of the gene is added as a class and declared
      equivalent to that curie;
    * synonyms are attached to the gene;
    * a gene-to-disorder association is added, with the relation resolved
      from the association type name and the evidence resolved from the
      association status name.

    A warning is logged when the number of processed associations differs
    from the ``count`` attribute of the association list.

    NOTE(review): this listing is truncated after the loop body shown
    here; the limit handling that follows is not part of this block.

    :param limit: used by the truncated remainder of the method
    :return:
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    line_counter = 0
    model = Model(graph)
    myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

    for event, elem in ET.iterparse(myfile):
        if elem.tag == 'Disorder':
            # get the element name and id, ignore element name
            # id = elem.get('id') # some internal identifier
            disorder_num = elem.find('OrphaNumber').text
            disorder_id = 'ORPHA:' + str(disorder_num)

            if self.test_mode and \
                    disorder_id not in self.all_test_ids['disease']:
                continue
            disorder_label = elem.find('Name').text

            # assuming that these are in the ontology (...any particular one?)
            model.addClassToGraph(disorder_id, disorder_label)
            assoc_list = elem.find('DisorderGeneAssociationList')
            expected_genes = assoc_list.get('count')
            LOG.info(
                'Expecting %s genes associated with disorder %s.',
                expected_genes, disorder_id)
            processed_genes = 0
            for assoc in assoc_list.findall('DisorderGeneAssociation'):
                processed_genes += 1
                gene = assoc.find('Gene')

                # get gene's curie HGNC or Ensembl ...
                lclid = gene.find('OrphaNumber').text
                gene_curie = 'ORPHA:' + lclid
                gene_set = {'ORPHA': lclid}
                for gene_ref in gene.findall(
                        './ExternalReferenceList/ExternalReference'):
                    gene_set[gene_ref.find('Source').text] = \
                        gene_ref.find('Reference').text

                # set priority (clique leader if available)
                # but default to ORPHA
                for source in ('HGNC', 'Ensembl', 'SwissProt'):
                    if source in gene_set:
                        # gene_set is keyed by the source name as written
                        # in the file, so take the id out with that key
                        # *before* translating the prefix; looking it up
                        # with the translated prefix raises KeyError
                        # whenever the translation differs.
                        leader_id = gene_set.pop(source)
                        pfx = source
                        if pfx in self.localtt:
                            pfx = self.localtt[pfx]
                        gene_curie = pfx + ':' + leader_id
                        model.addClassToGraph(gene_curie, None)
                        break

                # TEC have reservations w.r.t aggerator links being gene classes
                for prefix in gene_set:
                    lclid = gene_set[prefix]
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    dbxref = prefix + ':' + lclid
                    if gene_curie != dbxref:
                        model.addClassToGraph(dbxref, None)
                        model.addEquivalentClass(gene_curie, dbxref)

                # TEC. would prefer this not happen here. let HGNC handle it
                # except there are some w/o explicit external links ...
                gene_symbol = gene.find('Symbol').text  # not used below
                syn_list = gene.find('./SynonymList')
                if int(syn_list.get('count')) > 0:
                    for syn in syn_list.findall('./Synonym'):
                        model.addSynonym(gene_curie, syn.text)

                dg_label = assoc.find(
                    './DisorderGeneAssociationType/Name').text

                # use dg association status to issue an evidence code
                # FIXME I think that these codes are sub-optimal
                eco_id = self.resolve(
                    assoc.find('DisorderGeneAssociationStatus/Name').text)
                rel_id = self.resolve(dg_label)

                # Write the association to the same graph as everything
                # else (the test graph in test mode), not always self.graph.
                g2p_assoc = G2PAssoc(
                    graph, self.name, gene_curie, disorder_id, rel_id)
                g2p_assoc.add_evidence(eco_id)
                g2p_assoc.add_association_to_graph()

            elem.clear()  # empty the element
            if int(expected_genes) != processed_genes:
                # '%s' (not '% e...') so the disorder id is interpolated
                LOG.warning(
                    '%s expected %s associated genes but we processed %i',
                    disorder_id, expected_genes, processed_genes)
if self.test_mode and limit is not None and line_counter > limit:
#.........这里部分代码省略.........
示例9: _process_genes
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addEquivalentClass [as 别名]
def _process_genes(self, limit=None):
"""
Read the HGNC genes file and add each gene to the graph.

The target graph is ``self.testgraph`` in test mode and ``self.graph``
otherwise.  The file's first row is checked against the expected column
list ``self.files['genes']['columns']`` with ``self.check_fileheader``;
the process exits with status -1 when that check fails.  Fields of every
following row are picked by the position of their column name in that
list.

NOTE(review): this listing lost its indentation and is truncated after
the last line shown, so the exact nesting of the closing statements and
the handling of ``limit`` cannot be confirmed from here.

:param limit: not referenced in the visible part of the method
"""
if self.test_mode:
graph = self.testgraph
else:
graph = self.graph
geno = Genotype(graph)
model = Model(graph)
raw = '/'.join((self.rawdir, self.files['genes']['file']))
col = self.files['genes']['columns']
LOG.info("Processing HGNC genes")
# neither pattern is used in the visible part of the method
chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')
with open(raw, 'r', encoding="utf8") as csvfile:
filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
# the first row is the header; it must match the expected columns
row = next(filereader)
if not self.check_fileheader(col, row):
exit(-1)
for row in filereader:
# To generate:
# head -1 hgnc_complete_set.txt.1 | tr '\t' '\n' |
# sed "s/\(.*\)/\1 = row[col.index(\'\1\')]/g"
hgnc_id = row[col.index('hgnc_id')].strip()
symbol = row[col.index('symbol')].strip()
name = row[col.index('name')].strip()
# locus_group = row[col.index('locus_group')]
locus_type = row[col.index('locus_type')].strip()
# status = row[col.index('status')]
location = row[col.index('location')].strip()
# location_sortable = row[col.index('location_sortable')]
# alias_symbol = row[col.index('alias_symbol')]
# alias_name = row[col.index('alias_name')]
# prev_symbol = row[col.index('prev_symbol')]
# prev_name = row[col.index('prev_name')]
# gene_family = row[col.index('gene_family')]
# gene_family_id = row[col.index('gene_family_id')]
# date_approved_reserved = row[col.index('date_approved_reserved')]
# date_symbol_changed = row[col.index('date_symbol_changed')]
# date_name_changed = row[col.index('date_name_changed')]
# date_modified = row[col.index('date_modified')]
entrez_id = row[col.index('entrez_id')].strip()
ensembl_gene_id = row[col.index('ensembl_gene_id')].strip()
# vega_id = row[col.index('vega_id')]
# ucsc_id = row[col.index('ucsc_id')]
# ena = row[col.index('ena')]
# refseq_accession = row[col.index('refseq_accession')]
# ccds_id = row[col.index('ccds_id')]
# uniprot_ids = row[col.index('uniprot_ids')]
pubmed_ids = row[col.index('pubmed_id')].strip() # pipe separated!
# mgd_id = row[col.index('mgd_id')]
# rgd_id = row[col.index('rgd_id')]
# lsdb = row[col.index('lsdb')]
# cosmic = row[col.index('cosmic')]
omim_ids = row[col.index('omim_id')].strip() # pipe separated!
# mirbase = row[col.index('mirbase')]
# homeodb = row[col.index('homeodb')]
# snornabase = row[col.index('snornabase')]
# bioparadigms_slc = row[col.index('bioparadigms_slc')]
# orphanet = row[col.index('orphanet')]
# pseudogene.org = row[col.index('pseudogene.org')]
# horde_id = row[col.index('horde_id')]
# merops = row[col.index('merops')]
# imgt = row[col.index('imgt')]
# iuphar = row[col.index('iuphar')]
# kznf_gene_catalog = row[col.index('kznf_gene_catalog')]
# mamit_trnadb = row[col.index('mamit-trnadb')]
# cd = row[col.index('cd')]
# lncrnadb = row[col.index('lncrnadb')]
# enzyme_id = row[col.index('enzyme_id')]
# intermediate_filament_db = row[col.index('intermediate_filament_db')]
# rna_central_ids = row[col.index('rna_central_ids')]
# lncipedia = row[col.index('lncipedia')]
# gtrnadb = row[col.index('gtrnadb')]
# in test mode, rows whose entrez id is not among the test genes are skipped
if self.test_mode and entrez_id != '' and \
entrez_id not in self.gene_ids:
continue
if name == '':
name = None
if locus_type == 'withdrawn':
model.addDeprecatedClass(hgnc_id)
else:
# resolve() handing back its input is read as "no mapping found"
gene_type_id = self.resolve(locus_type, False) # withdrawn -> None?
if gene_type_id != locus_type:
model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)
model.makeLeader(hgnc_id)
if entrez_id != '':
model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)
if ensembl_gene_id != '':
#.........这里部分代码省略.........
示例10: _process_genes
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addEquivalentClass [as 别名]
def _process_genes(self, taxid, limit=None):
    """
    Read the Ensembl gene table for one taxon and add it to the graph.

    Every row supplies, in order: Ensembl gene id, gene name, description,
    gene biotype, Entrez gene id, Ensembl peptide id and UniProt/SwissProt
    id.  Rows for taxon ``'9606'`` carry the HGNC id as an eighth column.

    For each row the gene is added as a class whose type is resolved from
    the biotype (falling back to ``self.globaltt['polypeptide']`` when the
    biotype does not resolve).  The Entrez id becomes an xref for human
    genes and an equivalent class otherwise; an HGNC id becomes an
    equivalent class.  The taxon and any gene products are attached too.

    :param taxid: taxon number as a string; selects the file and is used
        to build the ``NCBITaxon:`` curie
    :param limit: outside test mode, stop once more than this many rows
        have been processed
    :raises ValueError: when a row has fewer columns than are read
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)

    raw = '/'.join((self.rawdir, self.files[taxid]['file']))
    line_counter = 0
    # in the case of human genes, we also get the hgnc id,
    # and is the last col
    is_human = taxid == '9606'
    required_columns = 8 if is_human else 7
    LOG.info("Processing Ensembl genes for tax %s", taxid)

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        for row in filereader:
            if len(row) < required_columns:
                # row is a list: let logging format it instead of
                # concatenating it onto a str (which raises TypeError)
                LOG.warning("Too few columns in: %s", row)
                raise ValueError("Data error for file %s" % raw)
            (ensembl_gene_id, external_gene_name, description, gene_biotype,
             entrezgene, ensembl_peptide_id, uniprotswissprot) = row[0:7]
            hgnc_id = row[7] if is_human else None

            if self.test_mode and entrezgene != '' and \
                    int(entrezgene) not in self.gene_ids:
                continue
            line_counter += 1

            gene_id = 'ENSEMBL:' + ensembl_gene_id
            peptide_curie = 'ENSEMBL:{}'.format(ensembl_peptide_id)
            uniprot_curie = 'UniProtKB:{}'.format(uniprotswissprot)
            entrez_curie = 'NCBIGene:{}'.format(entrezgene)

            if description == '':
                description = None
            gene_biotype = gene_biotype.strip()
            gene_type_id = self.resolve(gene_biotype, False)
            if gene_type_id == gene_biotype:  # did not resolve
                gene_type_id = self.globaltt['polypeptide']

            model.addClassToGraph(
                gene_id, external_gene_name, gene_type_id, description)
            # NOTE(review): these two individuals are added even when the
            # peptide / uniprot id is empty -- confirm that is intended.
            model.addIndividualToGraph(peptide_curie, None, gene_type_id)
            model.addIndividualToGraph(uniprot_curie, None, gene_type_id)

            if entrezgene != '':
                if is_human:
                    # Use HGNC for eq in human data
                    model.addXref(gene_id, entrez_curie)
                else:
                    model.addEquivalentClass(gene_id, entrez_curie)
            if hgnc_id is not None and hgnc_id != '':
                model.addEquivalentClass(gene_id, hgnc_id)
            geno.addTaxon('NCBITaxon:' + taxid, gene_id)
            if ensembl_peptide_id != '':
                geno.addGeneProduct(gene_id, peptide_curie)
            if uniprotswissprot != '':
                geno.addGeneProduct(gene_id, uniprot_curie)
                model.addXref(peptide_curie, uniprot_curie)

            if not self.test_mode and limit is not None \
                    and line_counter > limit:
                break
    return
示例11: _get_identifiers
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addEquivalentClass [as 别名]
def _get_identifiers(self, limit):
    """
    Process the id mapping file provided by Biogrid.

    The file sits inside a zip archive (its first member is read).  It has
    a very large header, which is scanned past up to and including the
    line starting with ``BIOGRID_ID``.  Every following line must hold four
    tab-separated values: biogrid number, identifier value, identifier
    type and organism name.

    For organisms in the species filter, identifiers whose mapped prefix
    is in the gene id type filter are declared equivalent classes of the
    ``BIOGRID:`` id; ``OFFICIAL_SYMBOL`` values become the label of the
    biogrid class.

    :param limit: outside test mode, stop once more than this many lines
        of the selected species have been seen
    :return: None
    """
    logger.info("getting identifier mapping")
    line_counter = 0
    f = '/'.join((self.rawdir, self.files['identifiers']['file']))

    # TODO align this species filter with the one above
    # speciesfilters = 'Homo sapiens,Mus musculus,Drosophila melanogaster,
    # Danio rerio, Caenorhabditis elegans,Xenopus laevis'.split(',')
    speciesfilters = 'Homo sapiens,Mus musculus'.split(',')
    # TODO make these filters available as commandline options
    # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
    # WormBase,XenBase,ENSEMBL,miRBase'.split(',')
    geneidtypefilters = 'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC'.split(',')
    # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'

    # the target graph does not change from line to line
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)

    foundheader = False
    # context managers close the archive even when a line fails to parse
    with ZipFile(f, 'r') as myzip:
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        with myzip.open(fname, 'r') as csvfile:
            for line in csvfile:
                line = line.decode()
                # skip header lines
                if not foundheader:
                    if re.match(r'BIOGRID_ID', line):
                        foundheader = True
                    continue

                # BIOGRID_ID
                # IDENTIFIER_VALUE
                # IDENTIFIER_TYPE
                # ORGANISM_OFFICIAL_NAME
                # 1 814566 ENTREZ_GENE Arabidopsis thaliana
                (biogrid_num, id_num, id_type,
                 organism_label) = line.strip().split('\t')

                # skip any genes that don't match our test set
                if self.testMode \
                        and int(biogrid_num) not in self.biogrid_ids:
                    continue

                # for each one of these,
                # create the node and add equivalent classes
                biogrid_id = 'BIOGRID:' + biogrid_num
                prefix = self._map_idtype_to_prefix(id_type)

                if organism_label.strip() in speciesfilters:
                    line_counter += 1
                    if prefix in geneidtypefilters:
                        mapped_id = ':'.join((prefix, id_num))
                        model.addEquivalentClass(biogrid_id, mapped_id)
                    # this symbol will only get attached to the biogrid class
                    elif id_type == 'OFFICIAL_SYMBOL':
                        model.addClassToGraph(biogrid_id, id_num)
                    # elif (id_type == 'SYNONYM'):
                    # FIXME - i am not sure these are synonyms, altids?
                    # gu.addSynonym(g,biogrid_id,id_num)

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break
    return
示例12: _process_omim2disease
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addEquivalentClass [as 别名]
def _process_omim2disease(self, limit=None):
    """
    Map KEGG disease IDs to the corresponding OMIM disease IDs.

    The whole mapping file is read first, filling
    ``self.omim_disease_hash`` (OMIM id -> KEGG ids) and
    ``self.kegg_disease_hash`` (KEGG id -> OMIM ids).  Only pairs that are
    1:1 in both directions are then written to the graph.

    Triples created:
    <kegg_disease_id> is a class
    <omim_disease_id> is a class
    <kegg_disease_id> equivalentClass <omim_disease_id>

    :param limit: outside test mode, the maximum number of OMIM entries
        examined when writing to the graph
    :return: None
    """
    LOG.info("Processing 1:1 KEGG disease to OMIM disease mappings")
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    raw = '/'.join((self.rawdir, self.files['omim2disease']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in reader:
            (omim_disease_id, kegg_disease_id, _link_type) = row
            kegg_disease_id = 'KEGG-' + kegg_disease_id.strip()
            omim_disease_id = re.sub(r'omim', 'OMIM', omim_disease_id)

            # Record the link in both directions:
            # OMIM ID -> KEGG IDs and KEGG ID -> OMIM IDs
            self.omim_disease_hash.setdefault(
                omim_disease_id, []).append(kegg_disease_id)
            self.kegg_disease_hash.setdefault(
                kegg_disease_id, []).append(omim_disease_id)

    # Now process the disease hashes
    # and only pass 1:1 omim disease:KEGG disease entries.
    examined = 0
    for omim_disease_id, kegg_ids in self.omim_disease_hash.items():
        if self.test_mode \
                and omim_disease_id not in self.test_ids['disease']:
            continue

        # Count the entries examined here.  The reader's line number is
        # fixed once the file has been read, so comparing it to the limit
        # would either never stop the loop or stop it before any entry.
        examined += 1
        if not self.test_mode and limit is not None and examined > limit:
            break

        if len(kegg_ids) != 1:
            # TODO add xrefs if >1:1 mapping?
            continue
        kegg_disease_id = kegg_ids[0]
        if len(self.kegg_disease_hash[kegg_disease_id]) == 1:
            # add ids, and deal with the labels separately
            model.addClassToGraph(kegg_disease_id, None)
            model.addClassToGraph(omim_disease_id, None)
            # TODO is this safe?
            model.addEquivalentClass(kegg_disease_id, omim_disease_id)

    LOG.info("Done with KEGG disease to OMIM disease mappings.")
示例13: _process_omim2gene
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addEquivalentClass [as 别名]
def _process_omim2gene(self, limit=None):
    """
    Relate OMIM IDs to KEGG gene IDs according to each row's link type.

    Every row of the tab-separated file must hold three columns: KEGG gene
    id, OMIM id and link type.

    * ``equivalent``: the OMIM id is added as a class and the KEGG id as a
      gene.  The OMIM id is swapped for a replacement from
      ``self.omim_replaced`` whose type is gene, and when
      ``self.omim_type`` records it as a gene the two are declared
      equivalent classes.
    * ``reverse``: a variant locus of the KEGG gene is created (labelled
      from ``self.label_hash``) and associated with the OMIM id using the
      'is marker for' relation.
    * ``original``: logged and skipped.
    * anything else: logged as a warning.

    :param limit: outside test mode, stop once the reader's line number
        exceeds this value
    :return: None
    """
    LOG.info("Processing OMIM to KEGG gene")
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)
    raw = '/'.join((self.rawdir, self.files['omim2gene']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for (kegg_num, omim_raw, link_type) in reader:
            # the test id list holds the un-prefixed KEGG ids
            if self.test_mode and kegg_num not in self.test_ids['genes']:
                continue

            kegg_gene_id = 'KEGG-' + kegg_num.strip()
            omim_id = omim_raw.replace('omim', 'OMIM')

            if link_type == 'equivalent':
                # these are genes!
                # so add them as a class then make equivalence
                model.addClassToGraph(omim_id, None)
                geno.addGene(kegg_gene_id, None)

                # previous: if omim type is not disease-ish then use
                # now is: if omim type is gene then use
                if omim_id in self.omim_replaced:
                    # the last replacement typed as gene wins
                    for candidate in self.omim_replaced[omim_id]:
                        if candidate in self.omim_type and \
                                self.omim_type[candidate] == \
                                self.globaltt['gene']:
                            omim_id = candidate
                if omim_id in self.omim_type and \
                        self.omim_type[omim_id] == self.globaltt['gene']:
                    model.addEquivalentClass(kegg_gene_id, omim_id)

            elif link_type == 'reverse':
                # make an association between an OMIM ID & the KEGG gene ID
                # we do this with omim ids because
                # they are more atomic than KEGG ids
                locus_id = self._make_variant_locus_id(kegg_gene_id, omim_id)
                locus_label = self.label_hash[locus_id]
                model.addIndividualToGraph(
                    locus_id, locus_label, self.globaltt['variant_locus'])
                geno.addAffectedLocus(locus_id, kegg_gene_id)
                model.addBlankNodeAnnotation(locus_id)

                # Add the disease to gene relationship.
                marker_rel = self.globaltt['is marker for']
                G2PAssoc(
                    graph, self.name, locus_id, omim_id, marker_rel
                ).add_association_to_graph()

            elif link_type == 'original':
                # these are sometimes a gene, and sometimes a disease
                LOG.info(
                    'Unable to handle original link for %s-%s',
                    kegg_gene_id, omim_id)
            else:
                # don't know what these are
                LOG.warning(
                    'Unhandled link type for %s-%s: %s',
                    kegg_gene_id, omim_id, link_type)

            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break

    LOG.info("Done with OMIM to KEGG gene")
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addEquivalentClass [as 别名]
def _process_genes(self, limit=None):
"""
Read the HGNC genes file and add each gene to the graph.

The target graph is ``self.testgraph`` in test mode and ``self.graph``
otherwise.  Every row, the header included, is unpacked into exactly 49
named columns, so a row of any other width raises ``ValueError``.  The
first row is then skipped as the header.

NOTE(review): this listing lost its indentation and is truncated after
the last line shown, so the nesting of the closing statements and the
handling of ``limit`` cannot be confirmed from here.

:param limit: not referenced in the visible part of the method
"""
if self.testMode:
g = self.testgraph
else:
g = self.graph
geno = Genotype(g)
model = Model(g)
raw = '/'.join((self.rawdir, self.files['genes']['file']))
line_counter = 0
logger.info("Processing HGNC genes")
with open(raw, 'r', encoding="utf8") as csvfile:
filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
# curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n .
for row in filereader:
# one name per column, in file order
(hgnc_id,
symbol,
name,
locus_group,
locus_type,
status,
location,
location_sortable,
alias_symbol,
alias_name,
prev_symbol,
prev_name,
gene_family,
gene_family_id,
date_approved_reserved,
date_symbol_changed,
date_name_changed,
date_modified,
entrez_id,
ensembl_gene_id,
vega_id,
ucsc_id,
ena,
refseq_accession,
ccds_id,
uniprot_ids,
pubmed_id,
mgd_id,
rgd_id,
lsdb,
cosmic,
omim_id,
mirbase,
homeodb,
snornabase,
bioparadigms_slc,
orphanet,
pseudogene_org,
horde_id,
merops,
imgt,
iuphar,
kznf_gene_catalog,
mamit_trnadb,
cd,
lncrnadb,
enzyme_id,
intermediate_filament_db,
rna_central_ids) = row
line_counter += 1
# skip header
if line_counter <= 1:
continue
# in test mode, rows whose entrez id is not among the test genes are skipped
if self.testMode and entrez_id != '' \
and int(entrez_id) not in self.gene_ids:
continue
if name == '':
name = None
gene_type_id = self._get_gene_type(locus_type)
model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)
if locus_type == 'withdrawn':
model.addDeprecatedClass(hgnc_id)
else:
model.makeLeader(hgnc_id)
if entrez_id != '':
model.addEquivalentClass(
hgnc_id, 'NCBIGene:' + entrez_id)
if ensembl_gene_id != '':
model.addEquivalentClass(
hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
# only a single (not pipe-delimited) OMIM id is considered
if omim_id != '' and "|" not in omim_id:
omim_curie = 'OMIM:' + omim_id
if not DipperUtil.is_omim_disease(omim_curie):
model.addEquivalentClass(hgnc_id, omim_curie)
geno.addTaxon('NCBITaxon:9606', hgnc_id)
# add pubs as "is about"
if pubmed_id != '':
#.........这里部分代码省略.........
示例15: _process_genes
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addEquivalentClass [as 别名]
def _process_genes(self, taxid, limit=None):
    """
    Read the Ensembl gene table for one taxon and add it to the graph.

    Every row supplies, in order: Ensembl gene id, gene name, description,
    gene biotype, Entrez gene id, peptide id and UniProt/SwissProt id.
    Rows for taxon ``'9606'`` carry the HGNC id as an eighth column.

    For each row the gene is added as a class (without a type), the
    peptide and UniProt curies are added as individuals typed via
    ``self._get_gene_type("polypeptide")``, Entrez and HGNC ids become
    equivalent classes, and the taxon and gene products are attached.

    :param taxid: taxon number as a string; selects the file and is used
        to build the ``NCBITaxon:`` curie
    :param limit: outside test mode, stop once more than this many rows
        have been processed
    :raises ValueError: when a row has fewer columns than are read
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    geno = Genotype(g)

    raw = '/'.join((self.rawdir, self.files[taxid]['file']))
    line_counter = 0
    # in the case of human genes, we also get the hgnc id,
    # and is the last col
    is_human = taxid == '9606'
    required_columns = 8 if is_human else 7
    logger.info("Processing Ensembl genes for tax %s", taxid)

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        for row in filereader:
            if len(row) < required_columns:
                # interpolate the file name; ValueError does not do
                # logging-style formatting of its arguments
                raise ValueError("Data error for file %s" % raw)
            (ensembl_gene_id, external_gene_name,
             description, gene_biotype, entrezgene,
             peptide_id, uniprot_swissprot) = row[0:7]
            hgnc_id = row[7] if is_human else None

            if self.testMode and entrezgene != '' \
                    and int(entrezgene) not in self.gene_ids:
                continue
            line_counter += 1

            gene_id = 'ENSEMBL:' + ensembl_gene_id
            peptide_curie = 'ENSEMBL:{}'.format(peptide_id)
            uniprot_curie = 'UniProtKB:{}'.format(uniprot_swissprot)
            entrez_curie = 'NCBIGene:{}'.format(entrezgene)

            if description == '':
                description = None
            # gene_type_id = self._get_gene_type(gene_biotype)
            gene_type_id = None

            model.addClassToGraph(
                gene_id, external_gene_name, gene_type_id, description)
            # NOTE(review): these two individuals are added even when the
            # peptide / uniprot id is empty -- confirm that is intended.
            model.addIndividualToGraph(
                peptide_curie, None, self._get_gene_type("polypeptide"))
            model.addIndividualToGraph(
                uniprot_curie, None, self._get_gene_type("polypeptide"))

            if entrezgene != '':
                model.addEquivalentClass(gene_id, entrez_curie)
            if hgnc_id is not None and hgnc_id != '':
                model.addEquivalentClass(gene_id, hgnc_id)
            geno.addTaxon('NCBITaxon:' + taxid, gene_id)
            if peptide_id != '':
                geno.addGeneProduct(gene_id, peptide_curie)
            if uniprot_swissprot != '':
                geno.addGeneProduct(gene_id, uniprot_curie)
                model.addXref(peptide_curie, uniprot_curie)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
    return