本文整理汇总了 Python 中 dipper.utils.GraphUtils.GraphUtils.addSynonym 方法的典型用法代码示例。如果您正苦于以下问题：Python GraphUtils.addSynonym 方法的具体用法？Python GraphUtils.addSynonym 怎么用？Python GraphUtils.addSynonym 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 dipper.utils.GraphUtils.GraphUtils 的用法示例。
在下文中一共展示了 GraphUtils.addSynonym 方法的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: _get_gene_history
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
def _get_gene_history(self, limit):
    """
    Record discontinued gene ids from the gene_history file.

    For every row that is kept, the current gene and the discontinued gene
    are both added as classes, the discontinued gene is marked as
    deprecated with the current gene as its replacement, and the
    discontinued symbol is added as a synonym of the current gene.

    A row is skipped when it is a comment line, when either gene number is
    '-', when (in test mode) the gene is not in self.gene_ids, or when the
    taxon is not in self.tax_ids.

    :param limit: outside of test mode, stop once more than this many rows
        have been kept; None disables the cut-off
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    g = self.testgraph if self.testMode else self.graph

    logger.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['gene_history']['file']))
    logger.info("FILE: %s", myfile)
    with gzip.open(myfile, 'rb') as f:
        for raw_line in f:
            line = raw_line.decode().strip()
            # skip comments
            if line.startswith('#'):
                continue

            # the fifth column (the discontinued date) is not used
            (tax_num, gene_num, discontinued_num,
             discontinued_symbol, _) = line.split('\t')

            if gene_num == '-' or discontinued_num == '-':
                continue
            if self.testMode and int(gene_num) not in self.gene_ids:
                continue
            if int(tax_num) not in self.tax_ids:
                continue

            # only rows that survive the filters count towards the limit
            line_counter += 1
            gene_id = ':'.join(('NCBIGene', gene_num))
            discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

            # add the two genes
            gu.addClassToGraph(g, gene_id, None)
            gu.addClassToGraph(g, discontinued_gene_id, discontinued_symbol)

            # add the new gene id to replace the old gene id
            gu.addDeprecatedClass(g, discontinued_gene_id, [gene_id])

            # also add the old symbol as a synonym of the new gene
            gu.addSynonym(g, gene_id, discontinued_symbol)

            if (not self.testMode) and \
                    (limit is not None and line_counter > limit):
                break

    return
示例2: _process_straininfo
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
def _process_straininfo(self, limit):
    """
    Add every strain listed in the 'straininfo' file to the graph.

    Each strain becomes an individual of taxon NCBITaxon:10090, optionally
    with its short name as a synonym, an equivalence or xref derived from
    the vendor and stock number, and the panel name as a description.
    The strain label is also cached in self.idlabel_hash.

    :param limit: accepted for signature compatibility; not used here
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph

    logger.info("Processing measurements ...")
    raw = '/'.join((self.rawdir, self.files['straininfo']['file']))
    tax_id = 'NCBITaxon:10090'
    gu = GraphUtils(curie_map.get())

    with open(raw, 'r') as f:
        reader = csv.reader(f, delimiter=',', quotechar='\"')
        f.readline()  # read the header row; skip
        for row in reader:
            # example row:
            # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
            (strain_name, vendor, stocknum, panel, mpd_strainid,
             straintype, n_proj, n_snp_datasets, mpdshortname, url) = row

            if self.testMode and \
                    'MPD:'+str(mpd_strainid) not in self.test_ids:
                continue

            # create the strain as an instance of the taxon
            strain_id = 'MPD-strain:'+str(mpd_strainid)
            gu.addIndividualToGraph(g, strain_id, strain_name, tax_id)

            short_name = mpdshortname.strip()
            if short_name != '':
                gu.addSynonym(g, strain_id, short_name)
            self.idlabel_hash[strain_id] = strain_name

            # make it equivalent to the vendor+stock
            # NOTE(review): the source indentation was lost; the final
            # else-branch is assumed to belong to the vendor test rather
            # than to the stock-number test -- confirm against upstream.
            if stocknum != '':
                if vendor == 'J':
                    gu.addSameIndividual(g, strain_id, 'JAX:'+stocknum)
                elif vendor == 'Rbrc':
                    # reiken
                    gu.addSameIndividual(
                        g, strain_id,
                        'RBRC:'+re.sub(r'RBRC', '', stocknum))
                else:
                    if url != '':
                        gu.addXref(g, strain_id, url, True)
                    if vendor != '':
                        vendor_ref = ':'.join((vendor, stocknum))
                        gu.addXref(g, strain_id, vendor_ref, True)

            # add the panel information
            if panel != '':
                gu.addDescription(g, strain_id, panel+' [panel]')

            # TODO make the panels as a resource collection

    return
示例3: _process_ortholog_classes
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
def _process_ortholog_classes(self, limit=None):
    """
    Add the KEGG orthology classes to the graph.

    Every row of the 'ortholog_classes' file is an (id, name) pair whose
    name is a ';'-separated list of labels. The first label is used as the
    class label. When there is more than one label, the remaining labels
    are added as synonyms and the last label is also added as the
    description.

    Triples created:
    <orthology_class_id> is a class
    <orthology_class_id> has label <orthology_symbols>
    <orthology_class_id> has description <orthology_description>

    :param limit: outside of test mode, stop once more than this many rows
        have been read; None disables the cut-off
    :return: None
    """
    logger.info("Processing ortholog classes")
    g = self.testgraph if self.testMode else self.graph
    line_counter = 0
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (orthology_class_id, orthology_class_name) = row
            if self.testMode and \
                    orthology_class_id not in \
                    self.test_ids['ortholog_classes']:
                continue

            # FIXME: What's the proper route for this?
            # The orthology class is essentially a KEGG gene ID
            # that is species agnostic.
            # Add the ID and label as a class.
            # Would it be considered a gene as well?
            other_labels = re.split(';', orthology_class_name)
            # the first one is the label we'll use
            orthology_label = other_labels[0]
            orthology_class_id = 'KEGG-'+orthology_class_id.strip()
            orthology_type = OrthologyAssoc.terms['gene_family']
            gu.addClassToGraph(
                g, orthology_class_id, orthology_label, orthology_type)

            if len(other_labels) > 1:
                # add the rest as synonyms; the first entry is already
                # the class label, so it is not repeated as a synonym
                for s in other_labels[1:]:
                    gu.addSynonym(g, orthology_class_id, s.strip())
                # add the last one as the description
                gu.addDescription(g, orthology_class_id, other_labels[-1])

            if (not self.testMode) and \
                    (limit is not None and line_counter > limit):
                break

    logger.info("Done with ortholog classes")
    return
示例4: process_gene_ids
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
def process_gene_ids(self, limit):
    """
    Add the genes listed in the gzipped 'gene_ids' file to the graph.

    Each row is (taxon, gene id, symbol, synonym, status). The gene is
    added as a class typed as a gene, labelled with its symbol (falling
    back to the synonym, or to no label when both are empty). Genes whose
    status is 'Dead' are deprecated, the taxon is attached to the gene,
    and a non-empty synonym is added as a synonym.

    :param limit: outside of test mode, stop once more than this many rows
        have been read; None disables the cut-off
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_ids']['file']))
    g = self.testgraph if self.testMode else self.graph

    gu = GraphUtils(curie_map.get())
    logger.info("Processing Gene IDs")
    line_counter = 0
    geno = Genotype(g)
    with gzip.open(raw, 'rb') as csvfile:
        text_stream = io.TextIOWrapper(csvfile, newline="")
        filereader = csv.reader(
            text_stream, delimiter=',', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # example row:
            # 6239,WBGene00000001,aap-1,Y110A7A.10,Live
            (taxon_num, gene_num, gene_symbol, gene_synonym, live) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            taxon_id = 'NCBITaxon:'+taxon_num
            gene_id = 'WormBase:'+gene_num

            # label preference: symbol, then synonym, then no label at all
            if gene_symbol == '':
                gene_symbol = gene_synonym
            if gene_symbol == '':
                gene_symbol = None

            gu.addClassToGraph(
                g, gene_id, gene_symbol, Genotype.genoparts['gene'])
            if live == 'Dead':
                gu.addDeprecatedClass(g, gene_id)
            geno.addTaxon(taxon_id, gene_id)
            if gene_synonym != '':
                gu.addSynonym(g, gene_id, gene_synonym)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
示例5: _get_titles
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
def _get_titles(self, limit):
    """
    Add the GeneReviews books listed in the 'titles' file to the graph.

    The file is read as tab-separated text (latin-1) and its first row is
    treated as a header. Every following row is unpacked as
    (shortname, title, nbk_num). The book number is always recorded in
    self.book_ids; the class (labelled with the title) and its short-name
    synonym are only added while the row count is below the limit.

    :param limit: rows whose 1-based line number (header included) is not
        below this value are recorded in self.book_ids but not added to
        the graph; None disables the cut-off
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['titles']['file']))
    gu = GraphUtils(curie_map.get())
    line_counter = 0
    with open(raw, 'r', encoding='latin-1') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            if line_counter == 1:  # skip header
                continue

            (shortname, title, nbk_num) = row
            gr_id = 'GeneReviews:'+nbk_num
            self.book_ids.add(nbk_num)  # a global set of the book nums

            within_limit = limit is None or line_counter < limit
            if within_limit:
                gu.addClassToGraph(self.graph, gr_id, title)
                gu.addSynonym(self.graph, gr_id, shortname)

    return
示例6: _get_gene_info
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
def _get_gene_info(self, limit):
"""
Currently loops through the gene_info file and creates the genes as classes, typed with SO. It will add their
label, any alternate labels as synonyms, alternate ids as equivlaent classes. HPRDs get added as
protein products. The chromosome and chr band get added as blank node regions, and the gene is faldo:located
on the chr band.
:param limit:
:return:
"""
gu = GraphUtils(curie_map.get())
if self.testMode:
g = self.testgraph
else:
g = self.graph
geno = Genotype(g)
# not unzipping the file
logger.info("Processing Gene records")
line_counter = 0
myfile = '/'.join((self.rawdir, self.files['gene_info']['file']))
logger.info("FILE: %s", myfile)
# Add taxa and genome classes for those in our filter
for tax_num in self.tax_ids:
tax_id = ':'.join(('NCBITaxon', str(tax_num)))
geno.addGenome(tax_id, str(tax_num)) # tax label can get added elsewhere
gu.addClassToGraph(g, tax_id, None) # label added elsewhere
with gzip.open(myfile, 'rb') as f:
for line in f:
# skip comments
line = line.decode().strip()
if re.match('^#', line):
continue
(tax_num, gene_num, symbol, locustag,
synonyms, xrefs, chr, map_loc, desc,
gtype, authority_symbol, name,
nomenclature_status, other_designations, modification_date) = line.split('\t')
##### set filter=None in init if you don't want to have a filter
#if self.filter is not None:
# if ((self.filter == 'taxids' and (int(tax_num) not in self.tax_ids))
# or (self.filter == 'geneids' and (int(gene_num) not in self.gene_ids))):
# continue
##### end filter
if self.testMode and int(gene_num) not in self.gene_ids:
continue
if int(tax_num) not in self.tax_ids:
continue
line_counter += 1
gene_id = ':'.join(('NCBIGene', gene_num))
tax_id = ':'.join(('NCBITaxon', tax_num))
gene_type_id = self._map_type_of_gene(gtype)
if symbol == 'NEWENTRY':
label = None
else:
label = symbol
# TODO might have to figure out if things aren't genes, and make them individuals
gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)
# we have to do special things here for genes, because they're classes not individuals
# f = Feature(gene_id,label,gene_type_id,desc)
if name != '-':
gu.addSynonym(g, gene_id, name)
if synonyms.strip() != '-':
for s in synonyms.split('|'):
gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])
if other_designations.strip() != '-':
for s in other_designations.split('|'):
gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])
# deal with the xrefs
# MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
if xrefs.strip() != '-':
for r in xrefs.strip().split('|'):
fixedr = self._cleanup_id(r)
if fixedr is not None and fixedr.strip() != '':
if re.match('HPRD', fixedr):
# proteins are not == genes.
gu.addTriple(g, gene_id, self.properties['has_gene_product'], fixedr)
else:
# skip some of these for now
if fixedr.split(':')[0] not in ['Vega', 'IMGT/GENE-DB']:
gu.addEquivalentClass(g, gene_id, fixedr)
# edge cases of id | symbol | chr | map_loc:
# 263 AMD1P2 X|Y with Xq28 and Yq12
# 438 ASMT X|Y with Xp22.3 or Yp11.3 # in PAR
# 419 ART3 4 with 4q21.1|4p15.1-p14 # no idea why there's two bands listed - possibly 2 assemblies
# 28227 PPP2R3B X|Y Xp22.33; Yp11.3 # in PAR
# 619538 OMS 10|19|3 10q26.3;19q13.42-q13.43;3p25.3 #this is of "unknown" type == susceptibility
# 101928066 LOC101928066 1|Un - # unlocated scaffold
#.........这里部分代码省略.........
示例7: Monochrom
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
#.........这里部分代码省略.........
for taxon in self.tax_ids:
self._get_chrbands(limit, str(taxon))
self.load_core_bindings()
self.load_bindings()
# using the full graph as the test here
self.testgraph = self.graph
logger.info("Found %d nodes", len(self.graph))
logger.info("Done parsing files.")
return
def _get_chrbands(self, limit, taxon):
"""
For the given taxon, it will fetch the chr band file.
We will not deal with the coordinate information with this parser.
Here, we only are concerned with building the partonomy.
:param limit:
:return:
"""
line_counter = 0
myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
logger.info("Processing Chr bands from FILE: %s", myfile)
geno = Genotype(self.graph)
# build the organism's genome from the taxon
genome_label = self.files[taxon]['genome_label']
taxon_id = 'NCBITaxon:'+taxon
# add the taxon as a class. adding the class label elsewhere
self.gu.addClassToGraph(self.graph, taxon_id, None)
self.gu.addSynonym(self.graph, taxon_id, genome_label)
self.gu.loadObjectProperties(self.graph, Feature.object_properties)
genome_id = geno.makeGenomeID(taxon_id)
geno.addGenome(taxon_id, genome_label)
self.gu.addOWLPropertyClassRestriction(
self.graph, genome_id, Genotype.object_properties['in_taxon'],
taxon_id)
with gzip.open(myfile, 'rb') as f:
for line in f:
# skip comments
line = line.decode().strip()
if re.match(r'^#', line):
continue
# chr13 4500000 10000000 p12 stalk
(chrom, start, stop, band, rtype) = line.split('\t')
line_counter += 1
# NOTE
# some less-finished genomes have placed and unplaced scaffolds
# * Placed scaffolds:
# Scaffold has an oriented location within a chromosome.
# * Unlocalized scaffolds:
# scaffold 's chromosome is known,
# scaffold's position, orientation or both is not known.
# *Unplaced scaffolds:
# it is not known which chromosome the scaffold belongs to.
# find out if the thing is a full on chromosome, or a scaffold:
# ex: unlocalized scaffold: chr10_KL568008v1_random
示例8: _process_diseasegene
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
def _process_diseasegene(self, limit):
"""
:param limit:
:return:
"""
if self.testMode:
g = self.testgraph
else:
g = self.graph
line_counter = 0
geno = Genotype(g)
gu = GraphUtils(curie_map.get())
myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))
# PYLINT complains iterparse deprecated,
# but as of py 3.4 only the optional & unsupplied parse arg is.
for event, elem in ET.iterparse(myfile):
if elem.tag == 'Disorder':
# get the element name and id, ignoreS element name
# id = elem.get('id') # some internal identifier
disorder_num = elem.find('OrphaNumber').text
disorder_id = 'Orphanet:'+str(disorder_num)
if self.testMode and \
disorder_id not in \
config.get_config()['test_ids']['disease']:
continue
disorder_label = elem.find('Name').text
# make a hash of internal gene id to type for later lookup
gene_iid_to_type = {}
gene_list = elem.find('GeneList')
for gene in gene_list.findall('Gene'):
gene_iid = gene.get('id')
gene_type = gene.find('GeneType').get('id')
gene_iid_to_type[gene_iid] = gene_type
# assuming that these are in the ontology
gu.addClassToGraph(g, disorder_id, disorder_label)
assoc_list = elem.find('DisorderGeneAssociationList')
for a in assoc_list.findall('DisorderGeneAssociation'):
gene_iid = a.find('.//Gene').get('id')
gene_name = a.find('.//Gene/Name').text
gene_symbol = a.find('.//Gene/Symbol').text
gene_num = a.find('./Gene/OrphaNumber').text
gene_id = 'Orphanet:'+str(gene_num)
gene_type_id = \
self._map_gene_type_id(gene_iid_to_type[gene_iid])
gu.addClassToGraph(
g, gene_id, gene_symbol, gene_type_id, gene_name)
syn_list = a.find('./Gene/SynonymList')
if int(syn_list.get('count')) > 0:
for s in syn_list.findall('./Synonym'):
gu.addSynonym(g, gene_id, s.text)
dgtype = a.find('DisorderGeneAssociationType').get('id')
rel_id = self._map_rel_id(dgtype)
dg_label = \
a.find('./DisorderGeneAssociationType/Name').text
if rel_id is None:
logger.warning(
"Cannot map association type (%s) to RO " +
"for association (%s | %s). Skipping.",
dg_label, disorder_label, gene_symbol)
continue
alt_locus_id = '_'+gene_num+'-'+disorder_num+'VL'
alt_label = \
' '.join(('some variant of', gene_symbol.strip(),
'that is a', dg_label.lower(),
disorder_label))
if self.nobnodes:
alt_locus_id = ':'+alt_locus_id
gu.addIndividualToGraph(g, alt_locus_id, alt_label,
geno.genoparts['variant_locus'])
geno.addAlleleOfGene(alt_locus_id, gene_id)
# consider typing the gain/loss-of-function variants like:
# http://sequenceontology.org/browser/current_svn/term/SO:0002054
# http://sequenceontology.org/browser/current_svn/term/SO:0002053
# use "assessed" status to issue an evidence code
# FIXME I think that these codes are sub-optimal
status_code = \
a.find('DisorderGeneAssociationStatus').get('id')
# imported automatically asserted information
# used in automatic assertion
eco_id = 'ECO:0000323'
# Assessed
# TODO are these internal ids stable between releases?
if status_code == '17991':
# imported manually asserted information
# used in automatic assertion
eco_id = 'ECO:0000322'
# Non-traceable author statement ECO_0000034
# imported information in automatic assertion ECO_0000313
#.........这里部分代码省略.........
示例9: OMIA
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
#.........这里部分代码省略.........
for event, elem in ET.iterparse(filereader):
self.process_xml_table(
elem, 'Article_Breed', self._process_article_breed_row, limit)
self.process_xml_table(
elem, 'Article_Phene', self._process_article_phene_row, limit)
self.process_xml_table(
elem, 'Breed_Phene', self._process_breed_phene_row, limit)
self.process_xml_table(
elem, 'Lida_Links', self._process_lida_links_row, limit)
self.process_xml_table(
elem, 'Phene_Gene', self._process_phene_gene_row, limit)
self.process_xml_table(
elem, 'Group_MPO', self._process_group_mpo_row, limit)
f.close()
return
# ############ INDIVIDUAL TABLE-LEVEL PROCESSING FUNCTIONS ################
def _process_species_table_row(self, row):
    """
    Add one species row to the graph and remember a label for it.

    The row is read by the keys 'gb_species_id', 'sci_name' and
    'com_name'. The taxon is added as a class labelled with the scientific
    name. A non-empty common name is added as a synonym and becomes the
    cached label in self.label_hash; otherwise the scientific name is
    cached.

    In test mode, species not listed in self.test_ids['taxon'] are
    ignored.

    :param row: mapping holding the species columns
    :return: None
    """
    # columns: gb_species_id, sci_name, com_name, added_by, date_modified
    tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
    sci_name = row['sci_name']
    com_name = row['com_name']

    if self.testMode and \
            (int(row['gb_species_id']) not in self.test_ids['taxon']):
        return

    self.gu.addClassToGraph(self.g, tax_id, sci_name)

    if com_name == '':
        self.label_hash[tax_id] = sci_name
        return

    self.gu.addSynonym(self.g, tax_id, com_name)
    self.label_hash[tax_id] = com_name  # for lookup later
    return
def _process_breed_row(self, row):
    """
    Add one breed row to the graph as an individual of its species.

    The row is read by the keys 'gb_species_id', 'breed_id' and
    'breed_name'. The breed key is appended to self.test_ids['breed'], the
    generated breed id is stored in self.id_hash['breed'], and the breed
    label (suffixed with the cached species label in parentheses when one
    is known) is stored in self.label_hash.

    :param row: mapping holding the breed columns
    :return: None
    """
    species_key = row['gb_species_id']

    # in test mode, keep all breeds of our test species
    if self.testMode and \
            (int(species_key) not in self.test_ids['taxon']):
        return

    breed_key = row['breed_id']

    # save the breed keys in the test_ids for later processing
    self.test_ids['breed'] += [int(breed_key)]

    breed_id = self.make_breed_id(breed_key)
    self.id_hash['breed'][breed_key] = breed_id

    tax_id = 'NCBITaxon:'+str(species_key)
    breed_label = row['breed_name']
    species_label = self.label_hash.get(tax_id)
    if species_label is not None:
        breed_label = breed_label + ' ('+species_label+')'

    self.gu.addIndividualToGraph(self.g, breed_id, breed_label, tax_id)
    self.label_hash[breed_id] = breed_label
    return
def _process_phene_row(self, row):
示例10: _get_equivids
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
def _get_equivids(self, limit):
    """
    Link GeneReviews books to the OMIM entries they are mapped to.

    The file processed here is of the format:
    #NBK_id GR_shortname    OMIM
    NBK1103 trimethylaminuria       136132
    NBK1103 trimethylaminuria       602079
    NBK1104 cdls    122470

    Each row maps a GeneReviews id to an OMIM id. The relationship is
    1:many, and some of the OMIM ids are genes rather than diseases, so
    only a loose coupling is made: the NBK ids are assumed to be
    higher-level grouping classes and the OMIM ids are treated as their
    subclasses. OMIM ids that are not returned by the OMIM phenotype
    filter are left unlinked.

    Every book in self.book_ids is additionally linked to DOID:4.

    :param limit: outside of test mode, stop reading once more than this
        many lines have been seen; also passed on to the OMIM lookup
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['idmap']['file']))
    gu = GraphUtils(curie_map.get())
    line_counter = 0

    # we look some stuff up in OMIM, so initialize here
    omim = OMIM()
    id_map = {}
    allomimids = set()
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            if line_counter == 1:  # skip header
                continue

            (nbk_num, shortname, omim_num) = row
            gr_id = 'GeneReviews:'+nbk_num
            omim_id = 'OMIM:'+omim_num

            # in test mode only the OMIM ids listed in test_ids are kept
            if self.testMode and not (
                    len(self.test_ids) > 0 and omim_id in self.test_ids):
                continue

            # sometimes there's bad omim nums
            if len(omim_num) > 6:
                logger.warning(
                    "OMIM number incorrectly formatted "
                    "in row %d; skipping:\n%s",
                    line_counter, '\t'.join(row))
                continue

            # build up a hashmap of the mappings; then process later
            id_map.setdefault(nbk_num, set()).add(omim_num)

            # add the class along with the shortname
            gu.addClassToGraph(self.graph, gr_id, None)
            gu.addSynonym(self.graph, gr_id, shortname)

            allomimids.add(omim_num)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break
        # end looping through file

    # get the omim ids that are not genes
    entries_that_are_phenotypes = omim.process_entries(
        list(allomimids), filter_keep_phenotype_entry_ids,
        None, None, limit)

    logger.info("Filtered out %d/%d entries that are genes or features",
                len(allomimids)-len(entries_that_are_phenotypes),
                len(allomimids))

    for nbk_num in self.book_ids:
        gr_id = 'GeneReviews:'+nbk_num
        if nbk_num in id_map:
            for omim_num in id_map.get(nbk_num):
                omim_id = 'OMIM:'+omim_num
                # add the gene reviews as a superclass to the omim id,
                # but only if the omim id is not a gene
                if omim_id not in entries_that_are_phenotypes:
                    continue
                gu.addClassToGraph(self.graph, omim_id, None)
                gu.addSubclass(self.graph, gr_id, omim_id)
        # add this as a generic subclass of DOID:4
        gu.addSubclass(self.graph, 'DOID:4', gr_id)

    return
示例11: UCSCBands
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
#.........这里部分代码省略.........
self.load_bindings()
# using the full graph as the test here
self.testgraph = self.graph
logger.info("Found %d nodes", len(self.graph))
logger.info("Done parsing files.")
return
def _get_chrbands(self, limit, taxon):
"""
:param limit:
:return:
"""
# TODO PYLINT figure out what limit was for and why it is unused
line_counter = 0
myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
logger.info("Processing Chr bands from FILE: %s", myfile)
geno = Genotype(self.graph)
monochrom = Monochrom()
# used to hold band definitions for a chr
# in order to compute extent of encompasing bands
mybands = {}
# build the organism's genome from the taxon
genome_label = self.files[taxon]['genome_label']
taxon_id = 'NCBITaxon:'+taxon
# add the taxon as a class. adding the class label elsewhere
self.gu.addClassToGraph(self.graph, taxon_id, None)
self.gu.addSynonym(self.graph, taxon_id, genome_label)
self.gu.loadObjectProperties(self.graph, Feature.object_properties)
self.gu.loadProperties(self.graph, Feature.data_properties,
self.gu.DATAPROP)
self.gu.loadAllProperties(self.graph)
geno.addGenome(taxon_id, genome_label)
# add the build and the taxon it's in
build_num = self.files[taxon]['build_num']
build_id = 'UCSC:'+build_num
geno.addReferenceGenome(build_id, build_num, taxon_id)
# process the bands
with gzip.open(myfile, 'rb') as f:
for line in f:
# skip comments
line = line.decode().strip()
if re.match('^#', line):
continue
# chr13 4500000 10000000 p12 stalk
(scaffold, start, stop, band_num, rtype) = line.split('\t')
line_counter += 1
# NOTE some less-finished genomes have
# placed and unplaced scaffolds
# * Placed scaffolds:
# the scaffolds have been placed within a chromosome.
# * Unlocalized scaffolds:
# although the chromosome within which the scaffold occurs
# is known, the scaffold's position or orientation
示例12: _get_variants
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
#.........这里部分代码省略.........
chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')
seqalt_id = ':'.join(('ClinVarVariant', variant_num))
gene_id = None
if str(gene_num) != '-1' and str(gene_num) != 'more than 10': # they use -1 to indicate unknown gene
gene_id = ':'.join(('NCBIGene', str(gene_num)))
# FIXME there are some "variants" that are actually haplotypes
# probably will get taken care of when we switch to processing the xml
# for example, variant_num = 38562
# but there's no way to tell if it's a haplotype in the csv data
# so the dbsnp or dbvar should probably be primary, and the variant num be the vslc,
# with each of the dbsnps being added to it
# todo clinical significance needs to be mapped to a list of terms
# first, make the variant:
f = Feature(seqalt_id, allele_name, allele_type_id)
if start != '-' and start.strip() != '':
f.addFeatureStartLocation(start, chrinbuild_id)
if stop != '-' and stop.strip() != '':
f.addFeatureEndLocation(stop, chrinbuild_id)
f.addFeatureToGraph(g)
if bandinbuild_id is not None:
f.addSubsequenceOfFeature(g, bandinbuild_id)
# CHECK - this makes the assumption that there is only one affected chromosome per variant
# what happens with chromosomal rearrangement variants? shouldn't both chromosomes be here?
# add the hgvs as synonyms
if hgvs_c != '-' and hgvs_c.strip() != '':
gu.addSynonym(g, seqalt_id, hgvs_c)
if hgvs_p != '-' and hgvs_p.strip() != '':
gu.addSynonym(g, seqalt_id, hgvs_p)
# add the dbsnp and dbvar ids as equivalent
if dbsnp_num != '-' and int(dbsnp_num) != -1:
dbsnp_id = 'dbSNP:rs'+str(dbsnp_num)
gu.addIndividualToGraph(g, dbsnp_id, None)
gu.addSameIndividual(g, seqalt_id, dbsnp_id)
if dbvar_num != '-':
dbvar_id = 'dbVar:'+dbvar_num
gu.addIndividualToGraph(g, dbvar_id, None)
gu.addSameIndividual(g, seqalt_id, dbvar_id)
# TODO - not sure if this is right... add as xref?
# the rcv is like the combo of the phenotype with the variant
if rcv_nums != '-':
for rcv_num in re.split(';',rcv_nums):
rcv_id = 'ClinVar:'+rcv_num
gu.addIndividualToGraph(g, rcv_id, None)
gu.addXref(g, seqalt_id, rcv_id)
if gene_id is not None:
# add the gene
gu.addClassToGraph(g, gene_id, gene_symbol)
# make a variant locus
vl_id = '_'+gene_num+'-'+variant_num
if self.nobnodes:
vl_id = ':'+vl_id
vl_label = allele_name
gu.addIndividualToGraph(g, vl_id, vl_label, geno.genoparts['variant_locus'])
geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
geno.addAlleleOfGene(vl_id, gene_id)
示例13: process_gaf
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
def process_gaf(self, file, limit, id_map=None):
if self.testMode:
g = self.testgraph
else:
g = self.graph
gu = GraphUtils(curie_map.get())
geno = Genotype(g)
logger.info("Processing Gene Associations from %s", file)
line_counter = 0
zfin = wbase = None
if 7955 in self.tax_ids:
zfin = ZFIN()
elif 6239 in self.tax_ids:
wbase = WormBase()
with gzip.open(file, 'rb') as csvfile:
filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
delimiter='\t', quotechar='\"')
for row in filereader:
line_counter += 1
# comments start with exclamation
if re.match(r'!', ''.join(row)):
continue
(db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
with_or_from, aspect, gene_name, gene_synonym, object_type,
taxon, date, assigned_by, annotation_extension,
gene_product_form_id) = row
# test for required fields
if (db == '' or gene_num == '' or gene_symbol == '' or
go_id == '' or ref == '' or eco_symbol == '' or
aspect == '' or object_type == '' or taxon == '' or
date == '' or assigned_by == ''):
logger.error(
"Missing required part of annotation " +
"on row %d:\n"+'\t'.join(row),
line_counter)
continue
# deal with qualifier NOT, contributes_to, colocalizes_with
if re.search(r'NOT', qualifier):
continue
db = self.clean_db_prefix(db)
uniprotid = None
gene_id = None
if db == 'UniProtKB':
mapped_ids = id_map.get(gene_num)
if id_map is not None and mapped_ids is not None:
if len(mapped_ids) == 1:
gene_id = mapped_ids[0]
uniprotid = ':'.join((db, gene_num))
gene_num = re.sub(r'\w+\:', '', gene_id)
elif len(mapped_ids) > 1:
# logger.warning(
# "Skipping gene id mapped for >1 gene %s -> %s",
# gene_num, str(mapped_ids))
continue
else:
continue
elif db == 'MGI':
gene_num = re.sub(r'MGI:', '', gene_num)
gene_id = ':'.join((db, gene_num))
gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
else:
gene_id = ':'.join((db, gene_num))
if self.testMode \
and not(
re.match(r'NCBIGene', gene_id) and
int(gene_num) in self.test_ids):
continue
gu.addClassToGraph(g, gene_id, gene_symbol)
if gene_name != '':
gu.addDescription(g, gene_id, gene_name)
if gene_synonym != '':
for s in re.split(r'\|', gene_synonym):
gu.addSynonym(g, gene_id, s.strip())
if re.search(r'\|', taxon):
# TODO add annotations with >1 taxon
logger.info(">1 taxon (%s) on line %d. skipping", taxon,
line_counter)
else:
tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
geno.addTaxon(tax_id, gene_id)
assoc = Assoc(self.name)
assoc.set_subject(gene_id)
assoc.set_object(go_id)
eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
if eco_id is not None:
assoc.add_evidence(eco_id)
refs = re.split(r'\|', ref)
#.........这里部分代码省略.........
示例14: _process_nlx_157874_1_view
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
def _process_nlx_157874_1_view(self, raw, limit=None):
"""
This table contains the Elements of Morphology data that has been
screen-scraped into DISCO.
Note that foaf:depiction is inverse of foaf:depicts relationship.
Since it is bad form to have two definitions,
we concatenate the two into one string.
Triples:
<eom id> a owl:Class
rdf:label Literal(eom label)
OIO:hasRelatedSynonym Literal(synonym list)
IAO:definition Literal(objective_def. subjective def)
foaf:depiction Literal(small_image_url),
Literal(large_image_url)
foaf:page Literal(page_url)
rdfs:comment Literal(long commented text)
:param raw:
:param limit:
:return:
"""
gu = GraphUtils(curie_map.get())
line_counter = 0
with open(raw, 'r') as f1:
f1.readline() # read the header row; skip
filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
for line in filereader:
line_counter += 1
(morphology_term_id, morphology_term_num,
morphology_term_label, morphology_term_url,
terminology_category_label, terminology_category_url,
subcategory, objective_definition, subjective_definition,
comments, synonyms, replaces, small_figure_url,
large_figure_url, e_uid, v_uid, v_uuid,
v_last_modified) = line
# note:
# e_uid v_uuid v_last_modified terminology_category_url
# subcategory v_uid morphology_term_num
# terminology_category_label hp_label notes
# are currently unused.
# Add morphology term to graph as a class
# with label, type, and description.
gu.addClassToGraph(self.graph, morphology_term_id,
morphology_term_label)
# Assemble the description text
if subjective_definition != '' and not (
re.match(r'.+\.$', subjective_definition)):
# add a trailing period.
subjective_definition = subjective_definition.strip() + '.'
if objective_definition != '' and not (
re.match(r'.+\.$', objective_definition)):
# add a trailing period.
objective_definition = objective_definition.strip() + '.'
definition = \
' '.join(
(objective_definition, subjective_definition)).strip()
gu.addDefinition(self.graph, morphology_term_id, definition)
# <term id> FOAF:depicted_by literal url
# <url> type foaf:depiction
# do we want both images?
# morphology_term_id has depiction small_figure_url
if small_figure_url != '':
gu.addDepiction(self.graph, morphology_term_id,
small_figure_url)
# morphology_term_id has depiction large_figure_url
if large_figure_url != '':
gu.addDepiction(self.graph, morphology_term_id,
large_figure_url)
# morphology_term_id has comment comments
if comments != '':
gu.addComment(self.graph, morphology_term_id,
comments.strip())
if synonyms != '':
for s in synonyms.split(';'):
gu.addSynonym(
self.graph, morphology_term_id, s.strip(),
gu.properties['hasExactSynonym'])
# morphology_term_id hasRelatedSynonym replaces (; delimited)
if replaces != '' and replaces != synonyms:
for s in replaces.split(';'):
gu.addSynonym(
self.graph, morphology_term_id, s.strip(),
gu.properties['hasRelatedSynonym'])
#.........这里部分代码省略.........
示例15: _process_ortholog_classes
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addSynonym [as 别名]
def _process_ortholog_classes(self, limit=None):
    """
    Add the KEGG orthology classes to the graph.

    Every row of the 'ortholog_classes' file is an (id, name) pair whose
    name is split on ';' and ',' into a list of labels. The first label is
    used as the class label. When there is more than one label, the
    remaining labels are added as synonyms, the last label is also added
    as the description, and any enzyme commission numbers embedded in that
    description are added as xrefs.

    Triples created:
    <orthology_class_id> is a class
    <orthology_class_id> has label <orthology_symbols>
    <orthology_class_id> has description <orthology_description>

    :param limit: outside of test mode, stop once more than this many rows
        have been read; None disables the cut-off
    :return: None
    """
    logger.info("Processing ortholog classes")
    g = self.testgraph if self.testMode else self.graph
    line_counter = 0
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (orthology_class_id, orthology_class_name) = row
            if self.testMode and \
                    orthology_class_id not in \
                    self.test_ids['orthology_classes']:
                continue

            # The orthology class is essentially a KEGG gene ID
            # that is species agnostic.
            # Add the ID and label as a gene family class
            other_labels = re.split(r'[;,]', orthology_class_name)
            # the first one is the label we'll use
            orthology_label = other_labels[0]
            orthology_class_id = 'KEGG-'+orthology_class_id.strip()
            orthology_type = OrthologyAssoc.terms['gene_family']
            gu.addClassToGraph(g, orthology_class_id, orthology_label,
                               orthology_type)

            if len(other_labels) > 1:
                # add the rest as synonyms; the first entry is already
                # the class label, so it is not repeated as a synonym
                for s in other_labels[1:]:
                    gu.addSynonym(g, orthology_class_id, s.strip())

                # add the last one as the description
                d = other_labels[-1]
                gu.addDescription(g, orthology_class_id, d)

                # add the enzyme commission number (EC:1.2.99.5) as an xref
                # sometimes there's two, like [EC:1.3.5.1 1.3.5.4]
                # can also have a dash, like EC:1.10.3.-
                # (findall always returns a list, possibly empty)
                for ecm in re.findall(r'((?:\d+|\.|-){5,7})', d):
                    gu.addXref(g, orthology_class_id, 'EC:'+ecm)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    logger.info("Done with ortholog classes")
    return