本文整理汇总了 Python 中 dipper.models.Model.Model.addSynonym 方法的典型用法代码示例。如果您正苦于以下问题：Python Model.addSynonym 方法的具体用法？Python Model.addSynonym 怎么用？Python Model.addSynonym 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 dipper.models.Model.Model 的用法示例。
在下文中一共展示了Model.addSynonym方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _process_straininfo
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addSynonym [as 别名]
def _process_straininfo(self, limit):
    """
    Read the 'straininfo' CSV and add every strain to the graph.

    Each data row becomes an individual typed with the mouse taxon
    (NCBITaxon:10090) and labelled with the strain name.  The MPD short
    name, when present, is attached as a synonym, and the strain label is
    remembered in ``self.idlabel_hash``.  When a stock number is given,
    the strain is tied to its vendor record: same-individual links for
    vendors 'J' (JAX) and 'Rbrc' (RBRC), cross-references otherwise.
    A non-empty panel is recorded as a description.

    In test mode, rows whose 'MPD:<id>' is not in ``self.test_ids``
    are skipped.

    :param limit: accepted for interface parity; not used by this method
    :return: None
    """
    graph = self.testgraph if self.testMode else self.graph
    model = Model(graph)
    logger.info("Processing measurements ...")
    raw = '/'.join((self.rawdir, self.files['straininfo']['file']))
    tax_id = 'NCBITaxon:10090'

    with open(raw, 'r') as f:
        reader = csv.reader(f, delimiter=',', quotechar='\"')
        # the first line is the header; hand it to the header check
        self.check_header(self.files['straininfo']['file'], f.readline())
        for row in reader:
            # example row:
            # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
            (strain_name, vendor, stocknum, panel, mpd_strainid,
             straintype, n_proj, n_snp_datasets, mpdshortname, url) = row
            mpd_num = str(mpd_strainid)
            if self.testMode and 'MPD:' + mpd_num not in self.test_ids:
                continue

            # the strain is an instance of the taxon
            strain_id = 'MPD-strain:' + mpd_num
            model.addIndividualToGraph(strain_id, strain_name, tax_id)
            short_name = mpdshortname.strip()
            if short_name != '':
                model.addSynonym(strain_id, short_name)
            self.idlabel_hash[strain_id] = strain_name

            # equate the strain with its vendor + stock number
            if stocknum != '':
                if vendor == 'J':
                    model.addSameIndividual(strain_id, 'JAX:'+stocknum)
                elif vendor == 'Rbrc':
                    # Riken stock numbers may already carry the prefix
                    model.addSameIndividual(
                        strain_id, 'RBRC:'+re.sub(r'RBRC', '', stocknum))
                else:
                    if url != '':
                        model.addXref(strain_id, url, True)
                    if vendor != '':
                        vendor_xref = ':'.join((vendor, stocknum))
                        model.addXref(strain_id, vendor_xref, True)

            # panel membership is kept as free text for now
            # TODO make the panels as a resource collection
            if panel != '':
                model.addDescription(strain_id, panel+' [panel]')
    return
示例2: process_gene_ids
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addSynonym [as 别名]
def process_gene_ids(self, limit):
    """
    Add the genes listed in the gzipped 'gene_ids' CSV to the graph.

    Each row carries (taxon number, gene number, symbol, synonym,
    live/dead status, gene type), e.g.
    ``6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene``.
    Every gene becomes a 'WormBase:' class typed as a gene and linked to
    its 'NCBITaxon:' taxon.  The label is the symbol, falling back to the
    synonym, or no label when both are empty.  Rows marked 'Dead' are
    additionally deprecated, and a non-empty synonym is added as such.

    :param limit: outside test mode, stop once more than this many rows
        have been read; None means no limit
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_ids']['file']))
    graph = self.testgraph if self.testMode else self.graph
    model = Model(graph)
    logger.info("Processing: %s", self.files['gene_ids']['file'])
    geno = Genotype(graph)

    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""), delimiter=',',
            quotechar='\"')
        # rows are counted from 1, including rows skipped in test mode
        for line_counter, row in enumerate(filereader, start=1):
            (taxon_num, gene_num, gene_symbol,
             gene_synonym, live, gene_type) = row

            if self.testMode and gene_num not in self.test_ids['gene']:
                continue

            taxon_id = 'NCBITaxon:'+taxon_num
            gene_id = 'WormBase:'+gene_num

            # label: symbol, else synonym, else nothing at all
            label = gene_synonym if gene_symbol == '' else gene_symbol
            if label == '':
                label = None
            model.addClassToGraph(
                gene_id, label, Genotype.genoparts['gene'])

            if live == 'Dead':
                model.addDeprecatedClass(gene_id)
            geno.addTaxon(taxon_id, gene_id)

            if gene_synonym is not None and gene_synonym != '':
                model.addSynonym(gene_id, gene_synonym)

            over_limit = limit is not None and line_counter > limit
            if not self.testMode and over_limit:
                break
    return
示例3: _get_titles
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addSynonym [as 别名]
def _get_titles(self, limit):
    """
    Add every GeneReviews book in the 'titles' file to the graph.

    The file is read as tab-delimited latin-1 text with one header row
    followed by four-column data rows:
        GR_shortname    GR_Title    NBK_id    PMID
    For each row the NBK number is recorded in ``self.book_ids`` and,
    subject to ``limit``, a 'GeneReviews:<NBK_id>' class is added with the
    title as its label and the short name as a synonym.

    A header that does not have exactly four columns (including a missing
    header in an empty file), or a data row whose width differs from the
    header's, is logged as an error and the process exits with status -1.

    :param limit: when not None, only rows whose line number (the header
        counts as line 1) is below this value are added to the graph;
        every row still contributes its NBK number to ``self.book_ids``
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['titles']['file']))
    model = Model(self.graph)
    with open(raw, 'r', encoding='latin-1') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        # an empty file yields an empty header, which fails the check below
        # instead of escaping as a bare StopIteration
        header = next(filereader, [])
        line_counter = 1
        colcount = len(header)
        if colcount != 4:  # ('GR_shortname', 'GR_Title', 'NBK_id', 'PMID')
            # the %s placeholder is required: logging applies extra
            # arguments to the message with the % operator
            logger.error("Unexpected Header %s", header)
            exit(-1)
        for row in filereader:
            line_counter += 1
            if len(row) != colcount:
                logger.error("Unexpected row. got: %s", row)
                logger.error("Expected data for: %s", header)
                exit(-1)
            (shortname, title, nbk_num, pmid) = row
            gr_id = 'GeneReviews:'+nbk_num
            self.book_ids.add(nbk_num)  # a global set of the book nums
            if limit is None or line_counter < limit:
                model.addClassToGraph(gr_id, title)
                model.addSynonym(gr_id, shortname)
                # TODO include the new PMID?
    return
示例4: _get_titles
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addSynonym [as 别名]
def _get_titles(self, limit):
    """
    Add every GeneReviews book in the 'titles' file to the graph.

    The file is read as tab-delimited latin-1 text.  Its first row is the
    header; the first character of the header's first field is dropped
    (presumably a leading '#' -- confirm against the data file) before the
    header is validated with ``self.check_fileheader`` against the expected
    columns:
        GR_shortname    GR_Title    NBK_id    PMID
    For each data row the NBK number is recorded in ``self.book_ids`` and,
    subject to ``limit``, a 'GeneReviews:<NBK_id>' class is added with the
    title as its label and the short name as a synonym.

    A failed header check, or a data row that does not have exactly four
    fields, terminates the process with exit status -1.

    :param limit: when not None, only rows whose reader line number is
        below this value are added to the graph; every row still
        contributes its NBK number to ``self.book_ids``
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['titles']['file']))
    model = Model(self.graph)
    col = ['GR_shortname', 'GR_Title', 'NBK_id', 'PMID']
    colcount = len(col)
    # column positions never change, so resolve them once, not once per row
    shortname_idx = col.index('GR_shortname')
    title_idx = col.index('GR_Title')
    nbk_idx = col.index('NBK_id')

    with open(raw, 'r', encoding='latin-1') as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        row = next(filereader)
        row[0] = row[0][1:]  # drop the first character of the header line
        if not self.check_fileheader(col, row):
            exit(-1)
        for row in filereader:
            if len(row) != colcount:
                LOG.error("Unexpected row. got: %s", row)
                LOG.error("Expected data for: %s", col)
                exit(-1)
            nbk_num = row[nbk_idx]
            gr_id = 'GeneReviews:' + nbk_num
            self.book_ids.add(nbk_num)  # a global set of the book nums
            if limit is None or filereader.line_num < limit:
                model.addClassToGraph(gr_id, row[title_idx])
                model.addSynonym(gr_id, row[shortname_idx])
示例5: _transform_entry
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addSynonym [as 别名]
def _transform_entry(self, e, graph):
g = graph
model = Model(g)
geno = Genotype(graph)
tax_num = '9606'
tax_id = 'NCBITaxon:9606'
tax_label = 'Human'
build_num = "GRCh38"
build_id = "NCBIGenome:"+build_num
# get the numbers, labels, and descriptions
omimnum = e['entry']['mimNumber']
titles = e['entry']['titles']
label = titles['preferredTitle']
other_labels = []
if 'alternativeTitles' in titles:
other_labels += self._get_alt_labels(titles['alternativeTitles'])
if 'includedTitles' in titles:
other_labels += self._get_alt_labels(titles['includedTitles'])
# add synonyms of alternate labels
# preferredTitle": "PFEIFFER SYNDROME",
# "alternativeTitles":
# "ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;;\nACS V;;\nNOACK SYNDROME",
# "includedTitles":
# "CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED"
# remove the abbreviation (comes after the ;) from the preferredTitle,
# and add it as a synonym
abbrev = None
if len(re.split(r';', label)) > 1:
abbrev = (re.split(r';', label)[1].strip())
newlabel = self._cleanup_label(label)
description = self._get_description(e['entry'])
omimid = 'OMIM:'+str(omimnum)
if e['entry']['status'] == 'removed':
model.addDeprecatedClass(omimid)
else:
omimtype = self._get_omimtype(e['entry'])
nodelabel = newlabel
# this uses our cleaned-up label
if omimtype == Genotype.genoparts['heritable_phenotypic_marker']:
if abbrev is not None:
nodelabel = abbrev
# in this special case,
# make it a disease by not declaring it as a gene/marker
model.addClassToGraph(omimid, nodelabel, None, newlabel)
elif omimtype == Genotype.genoparts['gene']:
if abbrev is not None:
nodelabel = abbrev
model.addClassToGraph(omimid, nodelabel, omimtype, newlabel)
else:
model.addClassToGraph(omimid, newlabel, omimtype)
# add the original screaming-caps OMIM label as a synonym
model.addSynonym(omimid, label)
# add the alternate labels and includes as synonyms
for l in other_labels:
model.addSynonym(omimid, l, 'OIO:hasRelatedSynonym')
# for OMIM, we're adding the description as a definition
model.addDefinition(omimid, description)
if abbrev is not None:
model.addSynonym(omimid, abbrev, 'OIO:hasRelatedSynonym')
# if this is a genetic locus (but not sequenced)
# then add the chrom loc info
# but add it to the ncbi gene identifier,
# not to the omim id (we reserve the omim id to be the phenotype)
feature_id = None
feature_label = None
if 'geneMapExists' in e['entry'] and e['entry']['geneMapExists']:
genemap = e['entry']['geneMap']
is_gene = False
if omimtype == \
Genotype.genoparts['heritable_phenotypic_marker']:
# get the ncbigene ids
ncbifeature = self._get_mapped_gene_ids(e['entry'], g)
if len(ncbifeature) == 1:
feature_id = 'NCBIGene:'+str(ncbifeature[0])
# add this feature as a cause for the omim disease
# TODO SHOULD I EVEN DO THIS HERE?
assoc = G2PAssoc(g, self.name, feature_id, omimid)
assoc.add_association_to_graph()
elif len(ncbifeature) > 1:
logger.info(
"Its ambiguous when %s maps to >1 gene id: %s",
omimid, str(ncbifeature))
else: # no ncbi feature, make an anonymous one
feature_id = self._make_anonymous_feature(str(omimnum))
feature_label = abbrev
elif omimtype == Genotype.genoparts['gene']:
#.........这里部分代码省略.........
示例6: _process_diseasegene
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addSynonym [as 别名]
def _process_diseasegene(self, limit):
"""
:param limit:
:return:
"""
if self.test_mode:
graph = self.testgraph
else:
graph = self.graph
line_counter = 0
model = Model(graph)
myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))
for event, elem in ET.iterparse(myfile):
if elem.tag == 'Disorder':
# get the element name and id, ignore element name
# id = elem.get('id') # some internal identifier
disorder_num = elem.find('OrphaNumber').text
disorder_id = 'ORPHA:' + str(disorder_num)
if self.test_mode and disorder_id not in self.all_test_ids['disease']:
continue
disorder_label = elem.find('Name').text
# assuming that these are in the ontology (...any particular one?)
model.addClassToGraph(disorder_id, disorder_label)
assoc_list = elem.find('DisorderGeneAssociationList')
expected_genes = assoc_list.get('count')
LOG.info(
'Expecting %s genes associated with disorder %s.',
expected_genes, disorder_id)
processed_genes = 0
for assoc in assoc_list.findall('DisorderGeneAssociation'):
processed_genes += 1
gene = assoc.find('Gene')
# get gene's curie HGNC or Ensembl ...
lclid = gene.find('OrphaNumber').text
gene_curie = 'ORPHA:' + lclid
gene_set = {'ORPHA': lclid}
for gene_ref in gene.findall(
'./ExternalReferenceList/ExternalReference'):
gene_set[gene_ref.find('Source').text] = \
gene_ref.find('Reference').text
# set priority (clique leader if available) but default to OPRHA
for pfx in ('HGNC', 'Ensembl', 'SwissProt'):
if pfx in gene_set:
if pfx in self.localtt:
pfx = self.localtt[pfx]
gene_curie = pfx + ':' + gene_set[pfx]
gene_set.pop(pfx)
model.addClassToGraph(gene_curie, None)
break
# TEC have reservations w.r.t aggerator links being gene classes
for prefix in gene_set:
lclid = gene_set[prefix]
if prefix in self.localtt:
prefix = self.localtt[prefix]
dbxref = prefix + ':' + lclid
if gene_curie != dbxref:
model.addClassToGraph(dbxref, None)
model.addEquivalentClass(gene_curie, dbxref)
# TEC. would prefer this not happen here. let HGNC handle it
# except there are some w/o explicit external links ...
gene_symbol = gene.find('Symbol').text
syn_list = gene.find('./SynonymList')
if int(syn_list.get('count')) > 0:
for syn in syn_list.findall('./Synonym'):
model.addSynonym(gene_curie, syn.text)
dg_label = assoc.find('./DisorderGeneAssociationType/Name').text
# use dg association status to issue an evidence code
# FIXME I think that these codes are sub-optimal
eco_id = self.resolve(
assoc.find('DisorderGeneAssociationStatus/Name').text)
rel_id = self.resolve(dg_label)
g2p_assoc = G2PAssoc(self.graph, self.name, gene_curie, disorder_id, rel_id)
g2p_assoc.add_evidence(eco_id)
g2p_assoc.add_association_to_graph()
elem.clear() # empty the element
if int(expected_genes) != processed_genes:
LOG.warning(
'% expected %s associated genes but we processed %i',
disorder_id, expected_genes, processed_genes)
if self.test_mode and limit is not None and line_counter > limit:
#.........这里部分代码省略.........
示例7: _process_nlx_157874_1_view
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addSynonym [as 别名]
def _process_nlx_157874_1_view(self, raw, limit=None):
"""
This table contains the Elements of Morphology data that has been
screen-scraped into DISCO.
Note that foaf:depiction is inverse of foaf:depicts relationship.
Since it is bad form to have two definitions,
we concatenate the two into one string.
Triples:
<eom id> a owl:Class
rdf:label Literal(eom label)
OIO:hasRelatedSynonym Literal(synonym list)
IAO:definition Literal(objective_def. subjective def)
foaf:depiction Literal(small_image_url),
Literal(large_image_url)
foaf:page Literal(page_url)
rdfs:comment Literal(long commented text)
:param raw:
:param limit:
:return:
"""
model = Model(self.graph)
line_counter = 0
with open(raw, 'r') as f1:
f1.readline() # read the header row; skip
filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
for line in filereader:
line_counter += 1
(morphology_term_id, morphology_term_num,
morphology_term_label, morphology_term_url,
terminology_category_label, terminology_category_url,
subcategory, objective_definition, subjective_definition,
comments, synonyms, replaces, small_figure_url,
large_figure_url, e_uid, v_uid, v_uuid,
v_last_modified, v_status, v_lastmodified_epoch) = line
# note:
# e_uid v_uuid v_last_modified terminology_category_url
# subcategory v_uid morphology_term_num
# terminology_category_label hp_label notes
# are currently unused.
# Add morphology term to graph as a class
# with label, type, and description.
model.addClassToGraph(morphology_term_id,
morphology_term_label)
# Assemble the description text
if subjective_definition != '' and not (
re.match(r'.+\.$', subjective_definition)):
# add a trailing period.
subjective_definition = subjective_definition.strip() + '.'
if objective_definition != '' and not (
re.match(r'.+\.$', objective_definition)):
# add a trailing period.
objective_definition = objective_definition.strip() + '.'
definition = \
' '.join(
(objective_definition, subjective_definition)).strip()
model.addDefinition(morphology_term_id, definition)
# <term id> FOAF:depicted_by literal url
# <url> type foaf:depiction
# do we want both images?
# morphology_term_id has depiction small_figure_url
if small_figure_url != '':
model.addDepiction(morphology_term_id,
small_figure_url)
# morphology_term_id has depiction large_figure_url
if large_figure_url != '':
model.addDepiction(morphology_term_id,
large_figure_url)
# morphology_term_id has comment comments
if comments != '':
model.addComment(morphology_term_id,
comments.strip())
if synonyms != '':
for s in synonyms.split(';'):
model.addSynonym(
morphology_term_id, s.strip(),
model.annotation_properties['hasExactSynonym'])
# morphology_term_id hasRelatedSynonym replaces (; delimited)
if replaces != '' and replaces != synonyms:
for s in replaces.split(';'):
model.addSynonym(
morphology_term_id, s.strip(),
model.annotation_properties['hasRelatedSynonym'])
#.........这里部分代码省略.........
示例8: _get_chrbands
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addSynonym [as 别名]
def _get_chrbands(self, limit, taxon):
"""
:param limit:
:return:
"""
model = Model(self.graph)
# TODO PYLINT figure out what limit was for and why it is unused
line_counter = 0
myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
logger.info("Processing Chr bands from FILE: %s", myfile)
geno = Genotype(self.graph)
monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)
# used to hold band definitions for a chr
# in order to compute extent of encompasing bands
mybands = {}
# build the organism's genome from the taxon
genome_label = self.files[taxon]['genome_label']
taxon_id = 'NCBITaxon:'+taxon
# add the taxon as a class. adding the class label elsewhere
model.addClassToGraph(taxon_id, None)
model.addSynonym(taxon_id, genome_label)
geno.addGenome(taxon_id, genome_label)
# add the build and the taxon it's in
build_num = self.files[taxon]['build_num']
build_id = 'UCSC:'+build_num
geno.addReferenceGenome(build_id, build_num, taxon_id)
# process the bands
with gzip.open(myfile, 'rb') as f:
for line in f:
# skip comments
line = line.decode().strip()
if re.match('^#', line):
continue
# chr13 4500000 10000000 p12 stalk
(scaffold, start, stop, band_num, rtype) = line.split('\t')
line_counter += 1
# NOTE some less-finished genomes have
# placed and unplaced scaffolds
# * Placed scaffolds:
# the scaffolds have been placed within a chromosome.
# * Unlocalized scaffolds:
# although the chromosome within which the scaffold occurs
# is known, the scaffold's position or orientation
# is not known.
# * Unplaced scaffolds:
# it is not known which chromosome the scaffold belongs to
#
# find out if the thing is a full on chromosome, or a scaffold:
# ex: unlocalized scaffold: chr10_KL568008v1_random
# ex: unplaced scaffold: chrUn_AABR07022428v1
placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
unlocalized_scaffold_pattern = \
placed_scaffold_pattern+r'_(\w+)_random'
unplaced_scaffold_pattern = r'chr(Un(?:_\w+)?)'
m = re.match(placed_scaffold_pattern+r'$', scaffold)
if m is not None and len(m.groups()) == 1:
# the chromosome is the first match of the pattern
chrom_num = m.group(1)
else:
# skip over anything that isn't a placed_scaffold
# at the class level
logger.info("Found non-placed chromosome %s", scaffold)
chrom_num = None
m_chr_unloc = re.match(unlocalized_scaffold_pattern, scaffold)
m_chr_unplaced = re.match(unplaced_scaffold_pattern, scaffold)
scaffold_num = None
if m:
pass
elif m_chr_unloc is not None and\
len(m_chr_unloc.groups()) == 2:
chrom_num = m_chr_unloc.group(1)
scaffold_num = chrom_num+'_'+m_chr_unloc.group(2)
elif m_chr_unplaced is not None and\
len(m_chr_unplaced.groups()) == 1:
scaffold_num = m_chr_unplaced.group(1)
else:
logger.error(
"There's a chr pattern that we aren't matching: %s",
scaffold)
if chrom_num is not None:
# the chrom class (generic) id
chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')
# first, add the chromosome class (in the taxon)
geno.addChromosomeClass(
chrom_num, taxon_id, self.files[taxon]['genome_label'])
#.........这里部分代码省略.........
示例9: process_feature_loc
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addSynonym [as 别名]
#.........这里部分代码省略.........
fid = 'WormBase:'+attribute_dict.get('variation')
flabel = attribute_dict.get('public_name')
sub = attribute_dict.get('substitution')
ins = attribute_dict.get('insertion')
# if it's a variation:
# variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T
desc = ''
if sub is not None:
desc = 'substitution='+sub
if ins is not None:
desc = 'insertion='+ins
# keep track of the strains with this variation,
# for later processing
strain_list = attribute_dict.get('strain')
if strain_list is not None:
for s in re.split(r',', strain_list):
if s.strip() not in strain_to_variant_map:
strain_to_variant_map[s.strip()] = set()
strain_to_variant_map[s.strip()].add(fid)
# if feature_type_label == 'RNAi_reagent':
# Target=WBRNAi00096030 1 4942
# this will tell us where the RNAi is actually binding
# target = attribute_dict.get('Target') # TODO unused
# rnai_num = re.split(r' ', target)[0] # TODO unused
# it will be the reagent-targeted-gene that has a position,
# (i think)
# TODO finish the RNAi binding location
name = attribute_dict.get('Name')
polymorphism = attribute_dict.get('polymorphism')
if fid is None:
if name is not None and re.match(r'WBsf', name):
fid = 'WormBase:'+name
name = None
else:
continue
if self.testMode \
and re.sub(r'WormBase:', '', fid) \
not in self.test_ids['gene']+self.test_ids['allele']:
continue
# these really aren't that interesting
if polymorphism is not None:
continue
if name is not None and not re.search(name, fid):
if flabel is None:
flabel = name
else:
model.addSynonym(fid, name)
if desc is not None:
model.addDescription(fid, desc)
alias = attribute_dict.get('Alias')
biotype = attribute_dict.get('biotype')
note = attribute_dict.get('Note')
other_name = attribute_dict.get('other_name')
for n in [alias, other_name]:
if n is not None:
model.addSynonym(fid, other_name)
ftype = self.get_feature_type_by_class_and_biotype(
feature_type_label, biotype)
chr_id = makeChromID(chrom, build_id, 'CHR')
geno.addChromosomeInstance(chrom, build_id, build_num)
feature = Feature(g, fid, flabel, ftype)
feature.addFeatureStartLocation(start, chr_id, strand)
feature.addFeatureEndLocation(start, chr_id, strand)
feature_is_class = False
if feature_type_label == 'gene':
feature_is_class = True
feature.addFeatureToGraph(True, None, feature_is_class)
if note is not None:
model.addDescription(fid, note)
if not self.testMode \
and limit is not None and line_counter > limit:
break
# RNAi reagents:
# I RNAi_primary RNAi_reagent 4184 10232 . + . Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10
# I RNAi_primary RNAi_reagent 4223 10147 . + . Target=WBRNAi00033465 1 5925 +;laboratory=SV;history_name=MV_SV:mv_G_YK5052
# I RNAi_primary RNAi_reagent 5693 9391 . + . Target=WBRNAi00066135 1 3699 +;laboratory=CH
# TODO TF bindiing sites and network:
# I TF_binding_site_region TF_binding_site 1861 2048 . + . Name=WBsf292777;tf_id=WBTranscriptionFactor000025;tf_name=DAF-16
# I TF_binding_site_region TF_binding_site 3403 4072 . + . Name=WBsf331847;tf_id=WBTranscriptionFactor000703;tf_name=DPL-1
return
示例10: _get_equivids
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addSynonym [as 别名]
def _get_equivids(self, limit):
    """
    Relate GeneReviews books to the OMIM entries listed for them.

    The file processed here is of the format:
    #NBK_id GR_shortname    OMIM
    NBK1103 trimethylaminuria       136132
    NBK1103 trimethylaminuria       602079
    NBK1104 cdls    122470
    Where each of the rows represents a mapping between
    a gr id and an omim id. These are a 1:many relationship,
    and some of the omim ids are genes (not diseases).
    Therefore, we need to create a loose coupling here.
    We make the assumption that these NBKs are generally higher-level
    grouping classes; therefore the OMIM ids are treated as subclasses.
    (This assumption is poor for those omims that are actually genes,
    but we have no way of knowing what those are here...
    we will just have to deal with that for now.)

    Processing happens in two passes: the file is read once to add each
    GeneReviews id as a class (with its short name as a synonym) and to
    collect the NBK -> OMIM number mapping; the collected OMIM numbers are
    then passed to ``omim.process_entries`` and only the ids it returns
    are added as subclasses of their GeneReviews book.

    :param limit: outside test mode, stop reading once more than this many
        lines have been seen; also forwarded to ``omim.process_entries``.
        None means no limit.
    :return:
    """
    raw = '/'.join((self.rawdir, self.files['idmap']['file']))
    model = Model(self.graph)
    line_counter = 0
    # we look some stuff up in OMIM, so initialize here
    omim = OMIM(self.graph_type, self.are_bnodes_skized)
    # NBK number -> set of OMIM numbers mapped to it
    id_map = {}
    # every OMIM number accepted from the file
    allomimids = set()
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            if line_counter == 1:  # skip header
                continue
            (nbk_num, shortname, omim_num) = row
            gr_id = 'GeneReviews:'+nbk_num
            omim_id = 'OMIM:'+omim_num
            # keep the row when not in test mode, or when in test mode
            # with a non-empty test_ids that contains this OMIM id
            if not (
                    (self.testMode and
                     len(self.test_ids) > 0 and
                     omim_id in self.test_ids) or not
                    self.testMode):
                continue
            # sometimes there's bad omim nums
            if len(omim_num) > 6:
                logger.warning(
                    "OMIM number incorrectly formatted " +
                    "in row %d; skipping:\n%s",
                    line_counter, '\t'.join(row))
                continue
            # build up a hashmap of the mappings; then process later
            if nbk_num not in id_map:
                id_map[nbk_num] = set()
            id_map[nbk_num].add(omim_num)
            # add the class along with the shortname
            model.addClassToGraph(gr_id, None)
            model.addSynonym(gr_id, shortname)
            allomimids.add(omim_num)
            # the limit only applies outside test mode
            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break
        # end looping through file
    # get the omim ids that are not genes
    entries_that_are_phenotypes = \
        omim.process_entries(
            list(allomimids), filter_keep_phenotype_entry_ids,
            None, None, limit)
    logger.info("Filtered out %d/%d entries that are genes or features",
                len(allomimids)-len(entries_that_are_phenotypes),
                len(allomimids))
    # second pass: iterate the books registered in self.book_ids
    # (populated elsewhere, not in this method)
    for nbk_num in self.book_ids:
        gr_id = 'GeneReviews:'+nbk_num
        if nbk_num in id_map:
            omim_ids = id_map.get(nbk_num)
            for omim_num in omim_ids:
                omim_id = 'OMIM:'+omim_num
                # add the gene reviews as a superclass to the omim id,
                # but only if the omim id is not a gene
                if omim_id in entries_that_are_phenotypes:
                    model.addClassToGraph(omim_id, None)
                    model.addSubClass(omim_id, gr_id)
        # add this as a generic subclass of DOID:4
        # NOTE(review): nesting reconstructed -- assumed to run once per
        # book, outside the id_map check; confirm against upstream
        model.addSubClass(gr_id, 'DOID:4')
    return
示例11: _get_gene_info
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addSynonym [as 别名]
def _get_gene_info(self, limit):
"""
Currently loops through the gene_info file and
creates the genes as classes, typed with SO. It will add their label,
any alternate labels as synonyms, alternate ids as equivlaent classes.
HPRDs get added as protein products.
The chromosome and chr band get added as blank node regions,
and the gene is faldo:located
on the chr band.
:param limit:
:return:
"""
if self.testMode:
g = self.testgraph
else:
g = self.graph
geno = Genotype(g)
model = Model(g)
# not unzipping the file
logger.info("Processing 'Gene Info' records")
line_counter = 0
gene_info = '/'.join((self.rawdir, self.files['gene_info']['file']))
logger.info("FILE: %s", gene_info)
# Add taxa and genome classes for those in our filter
for tax_num in self.tax_ids:
tax_id = ':'.join(('NCBITaxon', str(tax_num)))
# tax label can get added elsewhere
geno.addGenome(tax_id, str(tax_num))
# label added elsewhere
model.addClassToGraph(tax_id, None)
with gzip.open(gene_info, 'rb') as f:
row = f.readline().decode().strip().split('\t')
logger.info("Header has %i columns", len(row))
for line in f:
# skip comments
line = line.decode().strip()
if re.match(r'^#', line):
continue
(tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
map_loc, desc, gtype, authority_symbol, name,
nomenclature_status, other_designations,
modification_date, feature_type) = line.split('\t')
# ##set filter=None in init if you don't want to have a filter
# if self.filter is not None:
# if ((self.filter == 'taxids' and \
# (int(tax_num) not in self.tax_ids))
# or (self.filter == 'geneids' and \
# (int(gene_num) not in self.gene_ids))):
# continue
# #### end filter
if self.testMode and int(gene_num) not in self.gene_ids:
continue
if not self.testMode and int(tax_num) not in self.tax_ids:
continue
line_counter += 1
gene_id = ':'.join(('NCBIGene', gene_num))
tax_id = ':'.join(('NCBITaxon', tax_num))
gene_type_id = self.map_type_of_gene(gtype.strip())
if symbol == 'NEWENTRY':
label = None
else:
label = symbol
# sequence feature, not a gene
if gene_type_id == 'SO:0000110':
self.class_or_indiv[gene_id] = 'I'
else:
self.class_or_indiv[gene_id] = 'C'
if not self.testMode and \
limit is not None and line_counter > limit:
continue
if self.class_or_indiv[gene_id] == 'C':
model.addClassToGraph(gene_id, label, gene_type_id, desc)
# NCBI will be the default leader,
# so we will not add the leader designation here.
else:
model.addIndividualToGraph(
gene_id, label, gene_type_id, desc)
# in this case, they aren't genes.
# so we want someone else to be the leader.
if name != '-':
model.addSynonym(gene_id, name)
if synonyms.strip() != '-':
for s in synonyms.split('|'):
model.addSynonym(
gene_id, s.strip(),
Assoc.annotation_properties['hasRelatedSynonym'])
if other_designations.strip() != '-':
for s in other_designations.split('|'):
#.........这里部分代码省略.........
示例12: _get_gene_history
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addSynonym [as 别名]
def _get_gene_history(self, limit):
    """
    Record discontinued NCBI gene ids from the gzipped 'gene_history' file.

    Each non-comment line holds five tab-separated fields: taxon number,
    current gene number, discontinued gene number, discontinued symbol and
    discontinued date.  Lines with '-' for either gene number are ignored.
    For every remaining line the discontinued id is added (labelled with
    its old symbol) and deprecated in favour of the current gene id, and
    the old symbol is added as a synonym of the current gene.  Whether the
    pair is modelled as classes or individuals follows
    ``self.class_or_indiv`` for the current gene id.

    In test mode only genes in ``self.gene_ids`` are kept; otherwise only
    taxa in ``self.tax_ids`` are kept.

    :param limit: outside test mode, stop once more than this many lines
        have been kept; None means no limit
    :return: None
    """
    graph = self.testgraph if self.testMode else self.graph
    model = Model(graph)
    logger.info("Processing Gene records")
    myfile = '/'.join((self.rawdir, self.files['gene_history']['file']))
    logger.info("FILE: %s", myfile)
    line_counter = 0

    with gzip.open(myfile, 'rb') as f:
        for raw_line in f:
            line = raw_line.decode().strip()
            # comment lines start with '#'
            if re.match(r'^#', line):
                continue

            (tax_num, gene_num, discontinued_num, discontinued_symbol,
             discontinued_date) = line.split('\t')

            # nothing to link when either side is missing
            if gene_num == '-' or discontinued_num == '-':
                continue
            # filter on genes in test mode, on taxa otherwise
            if self.testMode:
                if int(gene_num) not in self.gene_ids:
                    continue
            elif int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1
            gene_id = ':'.join(('NCBIGene', gene_num))
            discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

            # add both genes; the new id replaces the old one
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addClassToGraph(gene_id, None)
                model.addClassToGraph(
                    discontinued_gene_id, discontinued_symbol)
                model.addDeprecatedClass(discontinued_gene_id, [gene_id])
            else:
                model.addIndividualToGraph(gene_id, None)
                model.addIndividualToGraph(
                    discontinued_gene_id, discontinued_symbol)
                model.addDeprecatedIndividual(
                    discontinued_gene_id, [gene_id])

            # the retired symbol lives on as a synonym of the new gene
            model.addSynonym(gene_id, discontinued_symbol)

            limit_reached = limit is not None and line_counter > limit
            if (not self.testMode) and limit_reached:
                break
    return
示例13: _process_diseasegene
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addSynonym [as 别名]
def _process_diseasegene(self, limit):
"""
Parse the Orphanet disease-gene XML file and add disorders, genes,
gene synonyms and per-association variant loci to the graph.

For every ``Disorder`` element the disorder is added as a class
(``Orphanet:<OrphaNumber>``). For every ``DisorderGeneAssociation``
beneath it, the gene is added as a class together with its synonyms,
the association type is mapped through ``self._map_rel_id`` (unmappable
associations are logged and skipped), and a blank-node variant locus
``_:<gene_num>-<disorder_num>VL`` is created and attached to the gene.
An ECO evidence code is then chosen from the association status.

NOTE(review): the remainder of this method is elided in this listing,
so what is ultimately done with ``rel_id``, ``eco_id``, ``line_counter``
and ``limit`` cannot be confirmed from the visible portion.

:param limit: not referenced in the visible portion of the method
:return: not determinable from the visible portion
"""
# write into the test graph when running in test mode
if self.testMode:
g = self.testgraph
else:
g = self.graph
# initialised here; not incremented in the visible portion
line_counter = 0
geno = Genotype(g)
model = Model(g)
myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))
# PYLINT complains iterparse deprecated,
# but as of py 3.4 only the optional & unsupplied parse arg is.
for event, elem in ET.iterparse(myfile):
if elem.tag == 'Disorder':
# get the disorder's OrphaNumber; the element's own 'id' attribute
# (an internal identifier) is ignored
# id = elem.get('id') # some internal identifier
disorder_num = elem.find('OrphaNumber').text
disorder_id = 'Orphanet:'+str(disorder_num)
# in test mode, only keep disorders listed in the configured test ids
if self.testMode and \
disorder_id not in \
config.get_config()['test_ids']['disease']:
continue
disorder_label = elem.find('Name').text
# make a hash of internal gene id to type for later lookup
gene_iid_to_type = {}
gene_list = elem.find('GeneList')
for gene in gene_list.findall('Gene'):
gene_iid = gene.get('id')
gene_type = gene.find('GeneType').get('id')
gene_iid_to_type[gene_iid] = gene_type
# assuming that these are in the ontology
model.addClassToGraph(disorder_id, disorder_label)
assoc_list = elem.find('DisorderGeneAssociationList')
for a in assoc_list.findall('DisorderGeneAssociation'):
# pull the gene's internal id, name, symbol and OrphaNumber
gene_iid = a.find('.//Gene').get('id')
gene_name = a.find('.//Gene/Name').text
gene_symbol = a.find('.//Gene/Symbol').text
gene_num = a.find('./Gene/OrphaNumber').text
gene_id = 'Orphanet:'+str(gene_num)
# the gene type comes from the GeneList lookup built above
gene_type_id = \
self._map_gene_type_id(gene_iid_to_type[gene_iid])
# gene_name is passed as the fourth positional argument
# (presumably a description; confirm against Model.addClassToGraph)
model.addClassToGraph(
gene_id, gene_symbol, gene_type_id, gene_name)
# add each listed synonym; the 'count' attribute gates the lookup
syn_list = a.find('./Gene/SynonymList')
if int(syn_list.get('count')) > 0:
for s in syn_list.findall('./Synonym'):
model.addSynonym(gene_id, s.text)
# map the association type id to a relationship id
dgtype = a.find('DisorderGeneAssociationType').get('id')
rel_id = self._map_rel_id(dgtype)
dg_label = \
a.find('./DisorderGeneAssociationType/Name').text
# associations whose type cannot be mapped are logged and skipped
if rel_id is None:
logger.warning(
"Cannot map association type (%s) to RO " +
"for association (%s | %s). Skipping.",
dg_label, disorder_label, gene_symbol)
continue
# blank-node id for the variant locus of this gene/disorder pair
alt_locus_id = '_:'+gene_num+'-'+disorder_num+'VL'
alt_label = \
' '.join(('some variant of', gene_symbol.strip(),
'that is a', dg_label.lower(),
disorder_label))
model.addIndividualToGraph(alt_locus_id, alt_label,
geno.genoparts['variant_locus'])
geno.addAffectedLocus(alt_locus_id, gene_id)
model.addBlankNodeAnnotation(alt_locus_id)
# consider typing the gain/loss-of-function variants like:
# http://sequenceontology.org/browser/current_svn/term/SO:0002054
# http://sequenceontology.org/browser/current_svn/term/SO:0002053
# use "assessed" status to issue an evidence code
# FIXME I think that these codes are sub-optimal
status_code = \
a.find('DisorderGeneAssociationStatus').get('id')
# default evidence code:
# imported automatically asserted information
# used in automatic assertion
eco_id = 'ECO:0000323'
# status id '17991' is "Assessed"
# TODO are these internal ids stable between releases?
if status_code == '17991':
# imported manually asserted information
# used in automatic assertion
eco_id = 'ECO:0000322'
# Non-traceable author statement ECO_0000034
# imported information in automatic assertion ECO_0000313
#.........这里部分代码省略.........
示例14: _get_gene_history
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addSynonym [as 别名]
def _get_gene_history(self, limit):
    """
    Record discontinued gene ids from the gene_history file.

    Loops through the gzipped, tab-separated gene_history file and adds
    the old gene ids as deprecated classes (or individuals), where the
    new gene id is the replacement for it.
    The old gene symbol is added as a synonym to the gene.

    Rows are skipped when either id is '-', when (in test mode) the gene
    is not in ``self.gene_ids``, or when (outside test mode) the taxon is
    not in ``self.tax_ids``. Blank lines and '#' comment lines are ignored.

    :param limit: outside test mode, stop once more than this many rows
        have been processed; ``None`` means no limit
    :return: None
    """
    src_key = 'gene_history'
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    LOG.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("FILE: %s", myfile)
    col = self.files[src_key]['columns']

    # column positions do not change between rows, so resolve them once
    tax_idx = col.index('tax_id')
    gene_idx = col.index('GeneID')
    disc_num_idx = col.index('Discontinued_GeneID')
    disc_sym_idx = col.index('Discontinued_Symbol')

    with gzip.open(myfile, 'rb') as tsv:
        row = tsv.readline().decode().strip().split('\t')
        row[0] = row[0][1:]  # strip comment
        # the result is ignored, as before: a header mismatch does not
        # abort the parse
        self.check_fileheader(col, row)

        for line in tsv:
            line = line.decode().strip()
            # skip blank lines (previously an IndexError) and comments
            if not line or line.startswith('#'):
                continue
            row = line.split('\t')

            tax_num = row[tax_idx].strip()
            gene_num = row[gene_idx].strip()
            discontinued_num = row[disc_num_idx].strip()
            discontinued_symbol = row[disc_sym_idx].strip()
            # the Discontinue_Date column is not used

            # '-' marks a missing id; such a row has nothing to link
            if gene_num == '-' or discontinued_num == '-':
                continue
            if self.test_mode and gene_num not in self.gene_ids:
                continue
            if not self.test_mode and tax_num not in self.tax_ids:
                continue

            line_counter += 1
            gene_id = ':'.join(('NCBIGene', gene_num))
            discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

            # add the two genes, as classes or individuals depending on
            # how the current gene was recorded in class_or_indiv
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addClassToGraph(gene_id, None)
                model.addClassToGraph(discontinued_gene_id, discontinued_symbol)
                # add the new gene id to replace the old gene id
                model.addDeprecatedClass(discontinued_gene_id, [gene_id])
            else:
                model.addIndividualToGraph(gene_id, None)
                model.addIndividualToGraph(
                    discontinued_gene_id, discontinued_symbol)
                model.addDeprecatedIndividual(discontinued_gene_id, [gene_id])

            # also add the old symbol as a synonym of the new gene
            model.addSynonym(gene_id, discontinued_symbol)

            if not self.test_mode and (limit is not None and line_counter > limit):
                break
示例15: _get_equivids
# 需要导入模块: from dipper.models.Model import Model [as 别名]
# 或者: from dipper.models.Model.Model import addSynonym [as 别名]
def _get_equivids(self, limit):
    """
    Read the GeneReviews-to-OMIM id map and collect the OMIM ids to process.

    The file processed here is of the format:
    #NBK_id GR_shortname    OMIM
    NBK1103 trimethylaminuria       136132
    NBK1103 trimethylaminuria       602079
    NBK1104 cdls    122470
    Where each of the rows represents a mapping between
    a gr id and an omim id. These are a 1:many relationship,
    and some of the omim ids are genes (not diseases).
    Therefore, we need to create a loose coupling here.
    We make the assumption that these NBKs are generally higher-level
    grouping classes; therefore the OMIM ids are treated as subclasses.

    Each GeneReviews id is added to the graph as a class with its short
    name as a synonym. The collected OMIM numbers are then pruned of
    replaced and obsolete ids.

    :param limit: outside test mode, stop reading once the reader's line
        number exceeds this value; ``None`` means no limit
    :raises SystemExit: with code -1 when the file header does not match
        the expected columns
    """
    raw = '/'.join((self.rawdir, self.files['idmap']['file']))
    model = Model(self.graph)
    LOG.info('Looping over %s', raw)
    # we look some stuff up in OMIM, so initialize here
    # omim = OMIM(self.graph_type, self.are_bnodes_skized)
    id_map = {}
    allomimids = set()
    col = ['NBK_id', 'GR_shortname', 'OMIM']
    # column positions are fixed, so resolve them once
    nbk_idx = col.index('NBK_id')
    shortname_idx = col.index('GR_shortname')
    omim_idx = col.index('OMIM')

    with open(raw, 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        row = next(reader)
        row[0] = row[0][1:]  # drop the leading '#' of the header line
        if not self.check_fileheader(col, row):
            # same exit status as before, without relying on the
            # site-provided exit() helper
            raise SystemExit(-1)

        # iterate the reader created above (the original referenced an
        # undefined name here, which raised NameError)
        for row in reader:
            nbk_num = row[nbk_idx]
            shortname = row[shortname_idx]
            omim_num = row[omim_idx]
            gr_id = 'GeneReviews:' + nbk_num
            # NOTE(review): omim_id is built before omim_num is stripped
            # below, so stray whitespace would defeat the test_ids match
            omim_id = 'OMIM:' + omim_num

            # in test mode keep only rows whose OMIM id is a test id
            if self.test_mode and not (
                    len(self.test_ids) > 0 and omim_id in self.test_ids):
                continue

            # sometimes there's bad omim nums
            omim_num = omim_num.strip()
            if len(omim_num) != 6:
                LOG.warning(
                    "OMIM number incorrectly formatted in row %i; skipping:\n%s",
                    reader.line_num, '\t'.join(row))
                continue

            # build up a hashmap of the mappings; then process later
            if nbk_num not in id_map:
                id_map[nbk_num] = set()
            id_map[nbk_num].add(omim_num)

            # add the class along with the shortname
            model.addClassToGraph(gr_id, None)
            model.addSynonym(gr_id, shortname)

            allomimids.add(omim_num)

            if not self.test_mode and limit is not None and reader.line_num > limit:
                break
        # end looping through file

    # given all_omim_ids from GR,
    # we want to update any which are changed or removed
    # before deciding which are disease / phenotypes
    replaced = allomimids & self.omim_replaced.keys()
    if replaced:
        LOG.warning("These OMIM ID's are past their pull date: %s", str(replaced))
        for oid in replaced:
            allomimids.remove(oid)
            replacements = self.omim_replaced[oid]
            for rep in replacements:
                # NOTE(review): set.update() adds each element of `rep`;
                # if `rep` is a single id string this adds its characters
                # rather than the id -- confirm the shape of omim_replaced
                allomimids.update(rep)

    # guard against omim identifiers which have been removed
    obsolete = [
        o for o in self.omim_type
        if self.omim_type[o] == self.globaltt['obsolete']]
    removed = allomimids & set(obsolete)
    if removed:
        LOG.warning("These OMIM ID's are gone: %s", str(removed))
        for oid in removed:
            allomimids.remove(oid)

    # filter for disease /phenotype types (we can argue about what is included)
    omim_phenotypes = {
        omim for omim in self.omim_type if self.omim_type[omim] in (
            self.globaltt['Phenotype'],
            self.globaltt['has_affected_feature'],  # both a gene and a phenotype
            self.globaltt['heritable_phenotypic_marker'])}  # probable phenotype
    LOG.info(
        "Have %i omim_ids globally typed as phenotypes from OMIM",
        len(omim_phenotypes))
#.........这里部分代码省略.........