本文整理汇总了Python中dipper.models.Genotype.Genotype.addTaxon方法的典型用法代码示例。如果您正苦于以下问题:Python Genotype.addTaxon方法的具体用法?Python Genotype.addTaxon怎么用?Python Genotype.addTaxon使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类dipper.models.Genotype.Genotype
的用法示例。
在下文中一共展示了Genotype.addTaxon方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _process_genes
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addTaxon [as 别名]
def _process_genes(self, taxid, limit=None):
gu = GraphUtils(curie_map.get())
if self.testMode:
g = self.testgraph
else:
g = self.graph
geno = Genotype(g)
raw = '/'.join((self.rawdir, self.files[taxid]['file']))
line_counter = 0
logger.info("Processing Ensembl genes for tax %s", taxid)
with open(raw, 'r', encoding="utf8") as csvfile:
filereader = csv.reader(csvfile, delimiter='\t')
for row in filereader:
if len(row) < 4:
logger.error("Data error for file %s", raw)
return
(ensembl_gene_id, external_gene_name, description,
gene_biotype, entrezgene) = row[0:5]
# in the case of human genes, we also get the hgnc id,
# and is the last col
if taxid == '9606':
hgnc_id = row[5]
else:
hgnc_id = None
if self.testMode and entrezgene != '' \
and int(entrezgene) not in self.gene_ids:
continue
line_counter += 1
gene_id = 'ENSEMBL:'+ensembl_gene_id
if description == '':
description = None
gene_type_id = self._get_gene_type(gene_biotype)
gene_type_id = None
gu.addClassToGraph(
g, gene_id, external_gene_name, gene_type_id, description)
if entrezgene != '':
gu.addEquivalentClass(g, gene_id, 'NCBIGene:'+entrezgene)
if hgnc_id is not None and hgnc_id != '':
gu.addEquivalentClass(g, gene_id, hgnc_id)
geno.addTaxon('NCBITaxon:'+taxid, gene_id)
if not self.testMode \
and limit is not None and line_counter > limit:
break
gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
gu.loadAllProperties(g)
return
示例2: process_gene_ids
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addTaxon [as 别名]
def process_gene_ids(self, limit):
raw = '/'.join((self.rawdir, self.files['gene_ids']['file']))
if self.testMode:
g = self.testgraph
else:
g = self.graph
model = Model(g)
logger.info("Processing: %s", self.files['gene_ids']['file'])
line_counter = 0
geno = Genotype(g)
with gzip.open(raw, 'rb') as csvfile:
filereader = csv.reader(
io.TextIOWrapper(csvfile, newline=""), delimiter=',',
quotechar='\"')
for row in filereader:
line_counter += 1
(taxon_num,
gene_num,
gene_symbol,
gene_synonym,
live,
gene_type) = row
# 6239,WBGene00000001,aap-1,Y110A7A.10,Live,protein_coding_gene
if self.testMode and gene_num not in self.test_ids['gene']:
continue
taxon_id = 'NCBITaxon:'+taxon_num
gene_id = 'WormBase:'+gene_num
if gene_symbol == '':
gene_symbol = gene_synonym
if gene_symbol == '':
gene_symbol = None
model.addClassToGraph(
gene_id, gene_symbol, Genotype.genoparts['gene'])
if live == 'Dead':
model.addDeprecatedClass(gene_id)
geno.addTaxon(taxon_id, gene_id)
if gene_synonym != '' and gene_synonym is not None:
model.addSynonym(gene_id, gene_synonym)
if not self.testMode \
and limit is not None and line_counter > limit:
break
return
示例3: _process_qtls_genomic_location
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addTaxon [as 别名]
def _process_qtls_genomic_location(
self, raw, txid, build_id, build_label, common_name, limit=None):
"""
This method
Triples created:
:param limit:
:return:
"""
if self.test_mode:
graph = self.testgraph
else:
graph = self.graph
model = Model(graph)
line_counter = 0
geno = Genotype(graph)
# assume that chrs get added to the genome elsewhere
taxon_curie = 'NCBITaxon:' + txid
eco_id = self.globaltt['quantitative trait analysis evidence']
LOG.info("Processing QTL locations for %s from %s", taxon_curie, raw)
with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
reader = csv.reader(tsvfile, delimiter="\t")
for row in reader:
line_counter += 1
if re.match(r'^#', ' '.join(row)):
continue
(chromosome, qtl_source, qtl_type, start_bp, stop_bp, frame, strand,
score, attr) = row
example = '''
Chr.Z Animal QTLdb Production_QTL 33954873 34023581...
QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";PUBMED_ID=17012160;trait_ID=2234;
trait="Spleen percentage";breed="leghorn";"FlankMarkers=ADL0022";VTO_name="spleen mass";
MO_name="spleen weight to body weight ratio";Map_Type="Linkage";Model="Mendelian";
Test_Base="Chromosome-wise";Significance="Significant";P-value="<0.05";F-Stat="5.52";
Variance="2.94";Dominance_Effect="-0.002";Additive_Effect="0.01
'''
str(example)
# make dictionary of attributes
# keys are:
# QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,FlankMarkers,
# VTO_name,Map_Type,Significance,P-value,Model,
# Test_Base,Variance, Bayes-value,PTO_name,gene_IDsrc,peak_cM,
# CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
# Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
# trait (duplicate with Name),Variance,Bayes-value,
# F-Stat,LOD-score,Additive_Effect,Dominance_Effect,
# Likelihood_Ratio,LS-means
# deal with poorly formed attributes
if re.search(r'"FlankMarkers";', attr):
attr = re.sub(r'FlankMarkers;', '', attr)
attr_items = re.sub(r'"', '', attr).split(";")
bad_attrs = set()
for attributes in attr_items:
if not re.search(r'=', attributes):
# remove this attribute from the list
bad_attrs.add(attributes)
attr_set = set(attr_items) - bad_attrs
attribute_dict = dict(item.split("=") for item in attr_set)
qtl_num = attribute_dict.get('QTL_ID')
if self.test_mode and int(qtl_num) not in self.test_ids:
continue
# make association between QTL and trait based on taxon
qtl_id = common_name + 'QTL:' + str(qtl_num)
model.addIndividualToGraph(qtl_id, None, self.globaltt['QTL'])
geno.addTaxon(taxon_curie, qtl_id)
#
trait_id = 'AQTLTrait:' + attribute_dict.get('trait_ID')
# if pub is in attributes, add it to the association
pub_id = None
if 'PUBMED_ID' in attribute_dict.keys():
pub_id = attribute_dict.get('PUBMED_ID')
if re.match(r'ISU.*', pub_id):
pub_id = 'AQTLPub:' + pub_id.strip()
reference = Reference(graph, pub_id)
else:
pub_id = 'PMID:' + pub_id.strip()
reference = Reference(
graph, pub_id, self.globaltt['journal article'])
reference.addRefToGraph()
# Add QTL to graph
assoc = G2PAssoc(
graph, self.name, qtl_id, trait_id,
self.globaltt['is marker for'])
assoc.add_evidence(eco_id)
assoc.add_source(pub_id)
if 'P-value' in attribute_dict.keys():
scr = re.sub(r'<', '', attribute_dict.get('P-value'))
if ',' in scr:
scr = re.sub(r',', '.', scr)
if scr.isnumeric():
#.........这里部分代码省略.........
示例4: _process_genes
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addTaxon [as 别名]
#.........这里部分代码省略.........
# ucsc_id = row[col.index('ucsc_id')]
# ena = row[col.index('ena')]
# refseq_accession = row[col.index('refseq_accession')]
# ccds_id = row[col.index('ccds_id')]
# uniprot_ids = row[col.index('uniprot_ids')]
pubmed_ids = row[col.index('pubmed_id')].strip() # pipe seperated!
# mgd_id = row[col.index('mgd_id')]
# rgd_id = row[col.index('rgd_id')]
# lsdb = row[col.index('lsdb')]
# cosmic = row[col.index('cosmic')]
omim_ids = row[col.index('omim_id')].strip() # pipe seperated!
# mirbase = row[col.index('mirbase')]
# homeodb = row[col.index('homeodb')]
# snornabase = row[col.index('snornabase')]
# bioparadigms_slc = row[col.index('bioparadigms_slc')]
# orphanet = row[col.index('orphanet')]
# pseudogene.org = row[col.index('pseudogene.org')]
# horde_id = row[col.index('horde_id')]
# merops = row[col.index('merops')]
# imgt = row[col.index('imgt')]
# iuphar = row[col.index('iuphar')]
# kznf_gene_catalog = row[col.index('kznf_gene_catalog')]
# mamit_trnadb = row[col.index('mamit-trnadb')]
# cd = row[col.index('cd')]
# lncrnadb = row[col.index('lncrnadb')]
# enzyme_id = row[col.index('enzyme_id')]
# intermediate_filament_db = row[col.index('intermediate_filament_db')]
# rna_central_ids = row[col.index('rna_central_ids')]
# lncipedia = row[col.index('lncipedia')]
# gtrnadb = row[col.index('gtrnadb')]
if self.test_mode and entrez_id != '' and \
entrez_id not in self.gene_ids:
continue
if name == '':
name = None
if locus_type == 'withdrawn':
model.addDeprecatedClass(hgnc_id)
else:
gene_type_id = self.resolve(locus_type, False) # withdrawn -> None?
if gene_type_id != locus_type:
model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)
model.makeLeader(hgnc_id)
if entrez_id != '':
model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)
if ensembl_gene_id != '':
model.addEquivalentClass(hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
for omim_id in omim_ids.split('|'):
if omim_id in self.omim_replaced:
repl = self.omim_replaced[omim_id]
LOG.warning('%s is replaced with %s', omim_id, repl)
for omim in repl:
if self.omim_type[omim] == self.globaltt['gene']:
omim_id = omim
if omim_id in self.omim_type and \
self.omim_type[omim_id] == self.globaltt['gene']:
model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)
geno.addTaxon(self.hs_txid, hgnc_id)
# add pubs as "is about"
for pubmed_id in pubmed_ids.split('|'):
graph.addTriple(
'PMID:' + pubmed_id, self.globaltt['is_about'], hgnc_id)
# add chr location
# sometimes two are listed, like: 10p11.2 or 17q25
# -- there are only 2 of these FRA10A and MPFD
# sometimes listed like "1 not on reference assembly"
# sometimes listed like 10q24.1-q24.3
# sometimes like 11q11 alternate reference locus
band = chrom = None
chr_match = chr_pattern.match(location)
if chr_match is not None and len(chr_match.groups()) > 0:
chrom = chr_match.group(1)
chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
band_match = band_pattern.search(location)
feat = Feature(graph, hgnc_id, None, None)
if band_match is not None and len(band_match.groups()) > 0:
band = band_match.group(1)
band = chrom + band
# add the chr band as the parent to this gene
# as a feature but assume that the band is created
# as a class with properties elsewhere in Monochrom
band_id = makeChromID(band, self.hs_txid, 'CHR')
model.addClassToGraph(band_id, None)
feat.addSubsequenceOfFeature(band_id)
else:
model.addClassToGraph(chrom_id, None)
feat.addSubsequenceOfFeature(chrom_id)
if not self.test_mode and limit is not None and \
filereader.line_num > limit:
break
示例5: _process_haplotype
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addTaxon [as 别名]
def _process_haplotype(
self, hap_id, hap_label, chrom_num, chrom_pos, context,
risk_allele_frequency, mapped_gene, so_ontology):
tax_id = 'NCBITaxon:9606'
if self.testMode:
g = self.testgraph
else:
g = self.graph
geno = Genotype(g)
model = Model(g)
# add the feature to the graph
hap_description = None
if risk_allele_frequency != '' and \
risk_allele_frequency != 'NR':
hap_description = \
str(risk_allele_frequency) + \
' [risk allele frequency]'
model.addIndividualToGraph(hap_id, hap_label.strip(),
Feature.types['haplotype'], hap_description)
geno.addTaxon(tax_id, hap_id)
snp_labels = re.split(r';\s?', hap_label)
chrom_nums = re.split(r';\s?', chrom_num)
chrom_positions = re.split(r';\s?', chrom_pos)
context_list = re.split(r';\s?', context)
mapped_genes = re.split(r';\s?', mapped_gene)
snp_curies = list()
for index, snp in enumerate(snp_labels):
snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
if snp_type is None:
# make blank node
snp_curie = self.make_id(snp, "_")
g.addTriple(hap_id, geno.object_properties['has_variant_part'],
snp_curie)
snp_curies.append(snp_curie)
# courtesy http://stackoverflow.com/a/16720915
length = len(snp_labels)
if not all(len(lst) == length
for lst in [chrom_nums, chrom_positions, context_list]):
logger.warn(
"Unexpected data field for haplotype {} \n "
"will not add snp details".format(hap_label))
return
variant_in_gene_count = 0
for index, snp_curie in enumerate(snp_curies):
self._add_snp_to_graph(
snp_curie, snp_labels[index], chrom_nums[index],
chrom_positions[index], context_list[index])
if len(mapped_genes) == len(snp_labels):
so_class = self._map_variant_type(context_list[index])
if so_class is None:
raise ValueError("Unknown SO class {} in haplotype {}"
.format(context_list[index], hap_label))
so_query = """
SELECT ?variant_label
WHERE {{
{0} rdfs:subClassOf+ SO:0001564 ;
rdfs:label ?variant_label .
}}
""".format(so_class)
query_result = so_ontology.query(so_query)
if len(list(query_result)) > 0:
gene_id = DipperUtil.get_ncbi_id_from_symbol(
mapped_genes[index])
if gene_id is not None:
geno.addAffectedLocus(snp_curie, gene_id)
geno.addAffectedLocus(hap_id, gene_id)
variant_in_gene_count += 1
if context_list[index] == 'upstream_gene_variant':
gene_id = DipperUtil.get_ncbi_id_from_symbol(
mapped_genes[index])
if gene_id is not None:
g.addTriple(
snp_curie,
Feature.object_properties[
'upstream_of_sequence_of'],
gene_id)
elif context_list[index] == 'downstream_gene_variant':
gene_id = DipperUtil.get_ncbi_id_from_symbol(
mapped_genes[index])
if gene_id is not None:
g.addTriple(
snp_curie,
Feature.object_properties[
'downstream_of_sequence_of'],
gene_id)
else:
logger.warn("More mapped genes than snps, "
"cannot disambiguate for {}".format(hap_label))
#.........这里部分代码省略.........
示例6: _process_haplotype
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addTaxon [as 别名]
def _process_haplotype(
self, hap_id, hap_label, chrom_num, chrom_pos, context,
risk_allele_frequency, mapped_gene, so_ontology):
if self.test_mode:
graph = self.testgraph
else:
graph = self.graph
geno = Genotype(graph)
model = Model(graph)
# add the feature to the graph
hap_description = None
if risk_allele_frequency != '' and risk_allele_frequency != 'NR':
hap_description = str(risk_allele_frequency) + ' [risk allele frequency]'
model.addIndividualToGraph(
hap_id, hap_label.strip(), self.globaltt['haplotype'], hap_description)
geno.addTaxon(self.globaltt["Homo sapiens"], hap_id)
snp_labels = re.split(r';\s?', hap_label)
chrom_nums = re.split(r';\s?', chrom_num)
chrom_positions = re.split(r';\s?', chrom_pos)
context_list = re.split(r';\s?', context)
mapped_genes = re.split(r';\s?', mapped_gene)
snp_curies = list()
for index, snp in enumerate(snp_labels):
snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
if snp_type is None:
# make blank node
snp_curie = self.make_id(snp, "_")
graph.addTriple(hap_id, self.globaltt['has_variant_part'], snp_curie)
snp_curies.append(snp_curie)
# courtesy http://stackoverflow.com/a/16720915
length = len(snp_labels)
if not all(len(lst) == length
for lst in [chrom_nums, chrom_positions, context_list]):
LOG.warning(
"Unexpected data field for haplotype %s \n "
"will not add snp details", hap_label)
return
variant_in_gene_count = 0
for index, snp_curie in enumerate(snp_curies):
self._add_snp_to_graph(
snp_curie, snp_labels[index], chrom_nums[index],
chrom_positions[index], context_list[index])
if len(mapped_genes) == len(snp_labels):
so_class = self.resolve(context_list[index])
# removed the '+' for recursive one-or-more rdfs:subClassOf paths
# just so it did not return an empty graph
so_query = """
SELECT ?variant_label
WHERE {{
{0} rdfs:subClassOf {1} ;
rdfs:label ?variant_label .
}}
""".format(so_class, self.globaltt['gene_variant'])
query_result = so_ontology.query(so_query)
if len(list(query_result)) == 1:
gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[index])
if gene_id is not None:
geno.addAffectedLocus(snp_curie, gene_id)
geno.addAffectedLocus(hap_id, gene_id)
variant_in_gene_count += 1
gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[index])
if gene_id is not None:
graph.addTriple(
snp_curie, self.resolve(context_list[index]), gene_id)
else:
LOG.warning(
"More mapped genes than snps, cannot disambiguate for %s",
hap_label)
# Seperate in case we want to apply a different relation
# If not this is redundant with triples added above
if len(mapped_genes) == variant_in_gene_count and len(set(mapped_genes)) == 1:
gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[0])
geno.addAffectedLocus(hap_id, gene_id)
return
示例7: _process_phenotype_data
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addTaxon [as 别名]
#.........这里部分代码省略.........
limit is not None and reader.line_num > limit):
break
# now that we've collected all of the variant information, build it
# we don't know their zygosities
for s in self.strain_hash:
h = self.strain_hash.get(s)
variants = h['variants']
genes = h['genes']
vl_set = set()
# make variant loci for each gene
if len(variants) > 0:
for var in variants:
vl_id = var.strip()
vl_symbol = self.id_label_hash[vl_id]
geno.addAllele(
vl_id, vl_symbol, self.globaltt['variant_locus'])
vl_set.add(vl_id)
if len(variants) == 1 and len(genes) == 1:
for gene in genes:
geno.addAlleleOfGene(vl_id, gene)
else:
geno.addAllele(vl_id, vl_symbol)
else: # len(vars) == 0
# it's just anonymous variants in some gene
for gene in genes:
vl_id = '_:' + re.sub(r':', '', gene) + '-VL'
vl_symbol = self.id_label_hash[gene]+'<?>'
self.id_label_hash[vl_id] = vl_symbol
geno.addAllele(
vl_id, vl_symbol, self.globaltt['variant_locus'])
geno.addGene(gene, self.id_label_hash[gene])
geno.addAlleleOfGene(vl_id, gene)
vl_set.add(vl_id)
# make the vslcs
vl_list = sorted(vl_set)
vslc_list = []
for vl in vl_list:
# for unknown zygosity
vslc_id = re.sub(r'^_', '', vl)+'U'
vslc_id = re.sub(r':', '', vslc_id)
vslc_id = '_:' + vslc_id
vslc_label = self.id_label_hash[vl] + '/?'
self.id_label_hash[vslc_id] = vslc_label
vslc_list.append(vslc_id)
geno.addPartsToVSLC(
vslc_id, vl, None, self.globaltt['indeterminate'],
self.globaltt['has_variant_part'], None)
model.addIndividualToGraph(
vslc_id, vslc_label,
self.globaltt['variant single locus complement'])
if len(vslc_list) > 0:
if len(vslc_list) > 1:
gvc_id = '-'.join(vslc_list)
gvc_id = re.sub(r'_|:', '', gvc_id)
gvc_id = '_:'+gvc_id
gvc_label = '; '.join(self.id_label_hash[v] for v in vslc_list)
model.addIndividualToGraph(
gvc_id, gvc_label,
self.globaltt['genomic_variation_complement'])
for vslc_id in vslc_list:
geno.addVSLCtoParent(vslc_id, gvc_id)
else:
# the GVC == VSLC, so don't have to make an extra piece
gvc_id = vslc_list.pop()
gvc_label = self.id_label_hash[gvc_id]
genotype_label = gvc_label + ' [n.s.]'
bkgd_id = re.sub(
r':', '', '-'.join((
self.globaltt['unspecified_genomic_background'], s)))
genotype_id = '-'.join((gvc_id, bkgd_id))
bkgd_id = '_:' + bkgd_id
geno.addTaxon(mouse_taxon, bkgd_id)
geno.addGenomicBackground(
bkgd_id, 'unspecified (' + s + ')',
self.globaltt['unspecified_genomic_background'],
"A placeholder for the unspecified genetic background for " + s)
geno.addGenomicBackgroundToGenotype(
bkgd_id, genotype_id,
self.globaltt['unspecified_genomic_background'])
geno.addParts(
gvc_id, genotype_id, self.globaltt['has_variant_part'])
geno.addGenotype(genotype_id, genotype_label)
graph.addTriple(
s, self.globaltt['has_genotype'], genotype_id)
else:
# LOG.debug(
# "Strain %s is not making a proper genotype.", s)
pass
LOG.warning(
"The following gene symbols did not list identifiers: %s",
str(sorted(list(genes_with_no_ids))))
LOG.error(
'%i symbols given are missing their gene identifiers',
len(genes_with_no_ids))
return
示例8: add_orthologs_by_gene_group
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addTaxon [as 别名]
def add_orthologs_by_gene_group(self, graph, gene_ids):
"""
This will get orthologies between human and other vertebrate genomes
based on the gene_group annotation pipeline from NCBI.
More information 9can be learned here:
http://www.ncbi.nlm.nih.gov/news/03-13-2014-gene-provides-orthologs-regions/
The method for associations is described in
[PMCID:3882889](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3882889/)
== [PMID:24063302](http://www.ncbi.nlm.nih.gov/pubmed/24063302/).
Because these are only between human and vertebrate genomes,
they will certainly miss out on very distant orthologies,
and should not be considered complete.
We do not run this within the NCBI parser itself;
rather it is a convenience function for others parsers to call.
:param graph:
:param gene_ids: Gene ids to fetch the orthology
:return:
"""
logger.info("getting gene groups")
line_counter = 0
f = '/'.join((self.rawdir, self.files['gene_group']['file']))
found_counter = 0
# because many of the orthologous groups are grouped by human gene,
# we need to do this by generating two-way hash
# group_id => orthologs
# ortholog id => group
# this will be the fastest approach, though not memory-efficient.
geno = Genotype(graph)
model = Model(graph)
group_to_orthology = {}
gene_to_group = {}
gene_to_taxon = {}
with gzip.open(f, 'rb') as csvfile:
filereader = csv.reader(
io.TextIOWrapper(csvfile, newline=""),
delimiter='\t',
quotechar='\"')
for row in filereader:
# skip comment lines
if re.match(r'\#', ''.join(row)):
continue
line_counter += 1
(tax_a, gene_a, rel, tax_b, gene_b) = row
if rel != 'Ortholog':
continue
if gene_a not in group_to_orthology:
group_to_orthology[gene_a] = set()
group_to_orthology[gene_a].add(gene_b)
if gene_b not in gene_to_group:
gene_to_group[gene_b] = set()
gene_to_group[gene_b].add(gene_a)
gene_to_taxon[gene_a] = tax_a
gene_to_taxon[gene_b] = tax_b
# also add the group lead as a member of the group
group_to_orthology[gene_a].add(gene_a)
# end loop through gene_group file
logger.debug("Finished hashing gene groups")
logger.debug("Making orthology associations")
for gid in gene_ids:
gene_num = re.sub(r'NCBIGene:', '', gid)
group_nums = gene_to_group.get(gene_num)
if group_nums is not None:
for group_num in group_nums:
orthologs = group_to_orthology.get(group_num)
if orthologs is not None:
for o in orthologs:
oid = 'NCBIGene:'+str(o)
model.addClassToGraph(
oid, None, Genotype.genoparts['gene'])
otaxid = 'NCBITaxon:'+str(gene_to_taxon[o])
geno.addTaxon(otaxid, oid)
assoc = OrthologyAssoc(graph, self.name, gid, oid)
assoc.add_source('PMID:24063302')
assoc.add_association_to_graph()
# todo get gene label for orthologs -
# this could get expensive
found_counter += 1
# finish loop through annotated genes
logger.info(
"Made %d orthology relationships for %d genes",
found_counter, len(gene_ids))
return
示例9: _process_data
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addTaxon [as 别名]
#.........这里部分代码省略.........
genomic_background_id = None
genotype_name = vslc_name
if genomic_background_id is not None:
geno.addGenotype(
genomic_background_id, strain_name,
geno.genoparts['genomic_background'])
# make a phenotyping-center-specific strain
# to use as the background
pheno_center_strain_label = \
strain_name + '/' + phenotyping_center
pheno_center_strain_id = \
'-'.join((re.sub(r':', '', genomic_background_id),
re.sub(r'\s', '_', phenotyping_center)))
if not re.match(r'^_', pheno_center_strain_id):
pheno_center_strain_id = '_'+pheno_center_strain_id
if self.nobnodes:
pheno_center_strain_id = ':'+pheno_center_strain_id
geno.addGenotype(pheno_center_strain_id,
pheno_center_strain_label,
geno.genoparts['genomic_background'])
geno.addSequenceDerivesFrom(pheno_center_strain_id,
genomic_background_id)
# Making genotype labels from the various parts,
# can change later if desired.
# since the genotype is reflective of the place
# it got made, should put that in to disambiguate
genotype_name = \
genotype_name+' ['+pheno_center_strain_label+']'
geno.addGenomicBackgroundToGenotype(
pheno_center_strain_id, genotype_id)
geno.addTaxon(pheno_center_strain_id, taxon_id)
# this is redundant, but i'll keep in in for now
geno.addSequenceDerivesFrom(genotype_id, colony_id)
genotype_name += '['+colony+']'
geno.addGenotype(genotype_id, genotype_name)
# Make the sex-qualified genotype,
# which is what the phenotype is associated with
sex_qualified_genotype_id = \
self.make_id(
(colony_id + phenotyping_center + zygosity +
strain_accession_id+sex))
sex_qualified_genotype_label = genotype_name+' ('+sex+')'
if sex == 'male':
sq_type_id = geno.genoparts['male_genotype']
elif sex == 'female':
sq_type_id = geno.genoparts['female_genotype']
else:
sq_type_id = geno.genoparts['sex_qualified_genotype']
geno.addGenotype(
sex_qualified_genotype_id,
sex_qualified_genotype_label, sq_type_id)
geno.addParts(
genotype_id, sex_qualified_genotype_id,
geno.object_properties['has_alternate_part'])
if genomic_background_id is not None and \
genomic_background_id != '':
# Add the taxon to the genomic_background_id
geno.addTaxon(taxon_id, genomic_background_id)
else:
# add it as the genomic background
示例10: _get_gene_info
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addTaxon [as 别名]
#.........这里部分代码省略.........
# TODO might have to figure out if things aren't genes, and make them individuals
gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)
# we have to do special things here for genes, because they're classes not individuals
# f = Feature(gene_id,label,gene_type_id,desc)
if name != '-':
gu.addSynonym(g, gene_id, name)
if synonyms.strip() != '-':
for s in synonyms.split('|'):
gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])
if other_designations.strip() != '-':
for s in other_designations.split('|'):
gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])
# deal with the xrefs
# MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
if xrefs.strip() != '-':
for r in xrefs.strip().split('|'):
fixedr = self._cleanup_id(r)
if fixedr is not None and fixedr.strip() != '':
if re.match('HPRD', fixedr):
# proteins are not == genes.
gu.addTriple(g, gene_id, self.properties['has_gene_product'], fixedr)
else:
# skip some of these for now
if fixedr.split(':')[0] not in ['Vega', 'IMGT/GENE-DB']:
gu.addEquivalentClass(g, gene_id, fixedr)
# edge cases of id | symbol | chr | map_loc:
# 263 AMD1P2 X|Y with Xq28 and Yq12
# 438 ASMT X|Y with Xp22.3 or Yp11.3 # in PAR
# 419 ART3 4 with 4q21.1|4p15.1-p14 # no idea why there's two bands listed - possibly 2 assemblies
# 28227 PPP2R3B X|Y Xp22.33; Yp11.3 # in PAR
# 619538 OMS 10|19|3 10q26.3;19q13.42-q13.43;3p25.3 #this is of "unknown" type == susceptibility
# 101928066 LOC101928066 1|Un - # unlocated scaffold
# 11435 Chrna1 2 2 C3|2 43.76 cM # mouse --> 2C3
# 11548 Adra1b 11 11 B1.1|11 25.81 cM # mouse --> 11B1.1
# 11717 Ampd3 7 7 57.85 cM|7 E2-E3 # mouse
# 14421 B4galnt1 10 10 D3|10 74.5 cM # mouse
# 323212 wu:fb92e12 19|20 - # fish
# 323368 ints10 6|18 - # fish
# 323666 wu:fc06e02 11|23 - # fish
# feel that the chr placement can't be trusted in this table when there is > 1 listed
# with the exception of human X|Y, i will only take those that align to one chr
# FIXME remove the chr mapping below when we pull in the genomic coords
if str(chr) != '-' and str(chr) != '':
if re.search('\|', str(chr)) and str(chr) not in ['X|Y','X; Y']:
# this means that there's uncertainty in the mapping. skip it
# TODO we'll need to figure out how to deal with >1 loc mapping
logger.info('%s is non-uniquely mapped to %s. Skipping for now.', gene_id, str(chr))
continue
# X|Y Xp22.33;Yp11.3
# if (not re.match('(\d+|(MT)|[XY]|(Un)$',str(chr).strip())):
# print('odd chr=',str(chr))
if str(chr) == 'X; Y':
chr = 'X|Y' # rewrite the PAR regions for processing
# do this in a loop to allow PAR regions like X|Y
for c in re.split('\|',str(chr)) :
geno.addChromosomeClass(c, tax_id, None) # assume that the chromosome label will get added elsewhere
mychrom = makeChromID(c, tax_num, 'CHR')
mychrom_syn = makeChromLabel(c, tax_num) # temporarily use the taxnum for the disambiguating label
gu.addSynonym(g, mychrom, mychrom_syn)
band_match = re.match('[0-9A-Z]+[pq](\d+)?(\.\d+)?$', map_loc)
if band_match is not None and len(band_match.groups()) > 0:
# if tax_num != '9606':
# continue
# this matches the regular kind of chrs, so make that kind of band
# not sure why this matches? chrX|Y or 10090chr12|Un"
# TODO we probably need a different regex per organism
# the maploc_id already has the numeric chromosome in it, strip it first
bid = re.sub('^'+c, '', map_loc)
maploc_id = makeChromID(c+bid, tax_num, 'CHR') # the generic location (no coordinates)
# print(map_loc,'-->',bid,'-->',maploc_id)
band = Feature(maploc_id, None, None) # Assume it's type will be added elsewhere
band.addFeatureToGraph(g)
# add the band as the containing feature
gu.addTriple(g, gene_id, Feature.object_properties['is_subsequence_of'], maploc_id)
else:
# TODO handle these cases
# examples are: 15q11-q22, Xp21.2-p11.23, 15q22-qter, 10q11.1-q24,
## 12p13.3-p13.2|12p13-p12, 1p13.3|1p21.3-p13.1, 12cen-q21, 22q13.3|22q13.3
logger.debug('not regular band pattern for %s: %s', gene_id, map_loc)
# add the gene as a subsequence of the chromosome
gu.addTriple(g, gene_id, Feature.object_properties['is_subsequence_of'], mychrom)
geno.addTaxon(tax_id, gene_id)
if not self.testMode and limit is not None and line_counter > limit:
break
gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
gu.loadAllProperties(g)
return
示例11: process_gaf
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addTaxon [as 别名]
#.........这里部分代码省略.........
if dbase == 'UniProtKB':
if id_map is not None and gene_num in id_map:
gene_id = id_map[gene_num]
uniprotid = ':'.join((dbase, gene_num))
(dbase, gene_num) = gene_id.split(':')
uniprot_hit += 1
else:
# LOG.warning(
# "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
# gene_num)
uniprot_miss += 1
continue
else:
gene_num = gene_num.split(':')[-1] # last
gene_id = ':'.join((dbase, gene_num))
if self.test_mode and not(
re.match(r'NCBIGene', gene_id) and
int(gene_num) in self.test_ids):
continue
model.addClassToGraph(gene_id, gene_symbol)
if gene_name != '':
model.addDescription(gene_id, gene_name)
if gene_synonym != '':
for syn in re.split(r'\|', gene_synonym):
model.addSynonym(gene_id, syn.strip())
if re.search(r'\|', taxon):
# TODO add annotations with >1 taxon
LOG.info(
">1 taxon (%s) on line %d. skipping", taxon, line_counter)
else:
tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
geno.addTaxon(tax_id, gene_id)
assoc = Assoc(graph, self.name)
assoc.set_subject(gene_id)
assoc.set_object(go_id)
try:
eco_id = eco_map[eco_symbol]
assoc.add_evidence(eco_id)
except KeyError:
LOG.error("Evidence code (%s) not mapped", eco_symbol)
refs = re.split(r'\|', ref)
for ref in refs:
ref = ref.strip()
if ref != '':
prefix = ref.split(':')[0] # sidestep 'MGI:MGI:'
if prefix in self.localtt:
prefix = self.localtt[prefix]
ref = ':'.join((prefix, ref.split(':')[-1]))
refg = Reference(graph, ref)
if prefix == 'PMID':
ref_type = self.globaltt['journal article']
refg.setType(ref_type)
refg.addRefToGraph()
assoc.add_source(ref)
# TODO add the source of the annotations from assigned by?
rel = self.resolve(aspect, mandatory=False)
if rel is not None and aspect == rel:
if aspect == 'F' and re.search(r'contributes_to', qualifier):
assoc.set_relationship(self.globaltt['contributes to'])
示例12: OMIA
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addTaxon [as 别名]
#.........这里部分代码省略.........
# add embryonic onset
assoc = D2PAssoc(self.name, omia_id, disease_id)
assoc.add_association_to_graph(self.g)
disease_id = None
else:
logger.info(
"No disease superclass defined for %s: %s",
omia_id, group_name)
# default to general disease FIXME this may not be desired
disease_id = 'DOID:4'
if group_summary == '':
group_summary = None
if group_name == '':
group_name = None
self.gu.addClassToGraph(
self.g, omia_id, group_name, disease_id, group_summary)
self.label_hash[omia_id] = group_name
return
def _process_gene_row(self, row):
if self.testMode and row['gene_id'] not in self.test_ids['gene']:
return
gene_id = 'NCBIGene:'+str(row['gene_id'])
self.id_hash['gene'][row['gene_id']] = gene_id
gene_label = row['symbol']
self.label_hash[gene_id] = gene_label
tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
gene_type_id = NCBIGene.map_type_of_gene(row['gene_type'])
self.gu.addClassToGraph(self.g, gene_id, gene_label, gene_type_id)
self.geno.addTaxon(tax_id, gene_id)
return
def _process_article_breed_row(self, row):
# article_id, breed_id, added_by
# don't bother putting these into the test... too many!
# and int(row['breed_id']) not in self.test_ids['breed']:
if self.testMode:
return
article_id = self.id_hash['article'].get(row['article_id'])
breed_id = self.id_hash['breed'].get(row['breed_id'])
# there's some missing data (article=6038). in that case skip
if article_id is not None:
self.gu.addTriple(
self.g, article_id, self.gu.object_properties['is_about'],
breed_id)
else:
logger.warning("Missing article key %s", str(row['article_id']))
return
def _process_article_phene_row(self, row):
"""
Linking articles to species-specific phenes.
:param row:
:return:
"""
# article_id, phene_id, added_by
示例13: _process_genes
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addTaxon [as 别名]
def _process_genes(self, limit=None):
if self.testMode:
g = self.testgraph
else:
g = self.graph
geno = Genotype(g)
model = Model(g)
raw = '/'.join((self.rawdir, self.files['genes']['file']))
line_counter = 0
logger.info("Processing HGNC genes")
with open(raw, 'r', encoding="utf8") as csvfile:
filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
# curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n .
for row in filereader:
(hgnc_id,
symbol,
name,
locus_group,
locus_type,
status,
location,
location_sortable,
alias_symbol,
alias_name,
prev_symbol,
prev_name,
gene_family,
gene_family_id,
date_approved_reserved,
date_symbol_changed,
date_name_changed,
date_modified,
entrez_id,
ensembl_gene_id,
vega_id,
ucsc_id,
ena,
refseq_accession,
ccds_id,
uniprot_ids,
pubmed_id,
mgd_id,
rgd_id,
lsdb,
cosmic,
omim_id,
mirbase,
homeodb,
snornabase,
bioparadigms_slc,
orphanet,
pseudogene_org,
horde_id,
merops,
imgt,
iuphar,
kznf_gene_catalog,
mamit_trnadb,
cd,
lncrnadb,
enzyme_id,
intermediate_filament_db,
rna_central_ids) = row
line_counter += 1
# skip header
if line_counter <= 1:
continue
if self.testMode and entrez_id != '' \
and int(entrez_id) not in self.gene_ids:
continue
if name == '':
name = None
gene_type_id = self._get_gene_type(locus_type)
model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)
if locus_type == 'withdrawn':
model.addDeprecatedClass(hgnc_id)
else:
model.makeLeader(hgnc_id)
if entrez_id != '':
model.addEquivalentClass(
hgnc_id, 'NCBIGene:' + entrez_id)
if ensembl_gene_id != '':
model.addEquivalentClass(
hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
if omim_id != '' and "|" not in omim_id:
omim_curie = 'OMIM:' + omim_id
if not DipperUtil.is_omim_disease(omim_curie):
model.addEquivalentClass(hgnc_id, omim_curie)
geno.addTaxon('NCBITaxon:9606', hgnc_id)
# add pubs as "is about"
if pubmed_id != '':
#.........这里部分代码省略.........
示例14: _process_genes
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addTaxon [as 别名]
def _process_genes(self, taxid, limit=None):
if self.testMode:
g = self.testgraph
else:
g = self.graph
model = Model(g)
geno = Genotype(g)
raw = '/'.join((self.rawdir, self.files[taxid]['file']))
line_counter = 0
logger.info("Processing Ensembl genes for tax %s", taxid)
with open(raw, 'r', encoding="utf8") as csvfile:
filereader = csv.reader(csvfile, delimiter='\t')
for row in filereader:
if len(row) < 4:
raise ValueError("Data error for file %s", raw)
(ensembl_gene_id, external_gene_name,
description, gene_biotype, entrezgene,
peptide_id, uniprot_swissprot) = row[0:7]
# in the case of human genes, we also get the hgnc id,
# and is the last col
if taxid == '9606':
hgnc_id = row[7]
else:
hgnc_id = None
if self.testMode and entrezgene != '' \
and int(entrezgene) not in self.gene_ids:
continue
line_counter += 1
gene_id = 'ENSEMBL:' + ensembl_gene_id
peptide_curie = 'ENSEMBL:{}'.format(peptide_id)
uniprot_curie = 'UniProtKB:{}'.format(uniprot_swissprot)
entrez_curie = 'NCBIGene:{}'.format(entrezgene)
if description == '':
description = None
# gene_type_id = self._get_gene_type(gene_biotype)
gene_type_id = None
model.addClassToGraph(
gene_id, external_gene_name, gene_type_id, description)
model.addIndividualToGraph(peptide_curie, None, self._get_gene_type("polypeptide"))
model.addIndividualToGraph(uniprot_curie, None, self._get_gene_type("polypeptide"))
if entrezgene != '':
model.addEquivalentClass(gene_id, entrez_curie)
if hgnc_id is not None and hgnc_id != '':
model.addEquivalentClass(gene_id, hgnc_id)
geno.addTaxon('NCBITaxon:'+taxid, gene_id)
if peptide_id != '':
geno.addGeneProduct(gene_id, peptide_curie)
if uniprot_swissprot != '':
geno.addGeneProduct(gene_id, uniprot_curie)
model.addXref(peptide_curie, uniprot_curie)
if not self.testMode \
and limit is not None and line_counter > limit:
break
return
示例15: process_gaf
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addTaxon [as 别名]
def process_gaf(self, file, limit, id_map=None):
if self.testMode:
g = self.testgraph
else:
g = self.graph
model = Model(g)
geno = Genotype(g)
logger.info("Processing Gene Associations from %s", file)
line_counter = 0
if 7955 in self.tax_ids:
zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
elif 6239 in self.tax_ids:
wbase = WormBase(self.graph_type, self.are_bnodes_skized)
with gzip.open(file, 'rb') as csvfile:
filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
delimiter='\t', quotechar='\"')
for row in filereader:
line_counter += 1
# comments start with exclamation
if re.match(r'!', ''.join(row)):
continue
(db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
with_or_from, aspect, gene_name, gene_synonym, object_type,
taxon, date, assigned_by, annotation_extension,
gene_product_form_id) = row
# test for required fields
if (db == '' or gene_num == '' or gene_symbol == '' or
go_id == '' or ref == '' or eco_symbol == '' or
aspect == '' or object_type == '' or taxon == '' or
date == '' or assigned_by == ''):
logger.error(
"Missing required part of annotation " +
"on row %d:\n"+'\t'.join(row),
line_counter)
continue
# deal with qualifier NOT, contributes_to, colocalizes_with
if re.search(r'NOT', qualifier):
continue
db = self.clean_db_prefix(db)
uniprotid = None
gene_id = None
if db == 'UniProtKB':
mapped_ids = id_map.get(gene_num)
if id_map is not None and mapped_ids is not None:
if len(mapped_ids) == 1:
gene_id = mapped_ids[0]
uniprotid = ':'.join((db, gene_num))
gene_num = re.sub(r'\w+\:', '', gene_id)
elif len(mapped_ids) > 1:
# logger.warning(
# "Skipping gene id mapped for >1 gene %s -> %s",
# gene_num, str(mapped_ids))
continue
else:
continue
elif db == 'MGI':
gene_num = re.sub(r'MGI:', '', gene_num)
gene_id = ':'.join((db, gene_num))
gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
else:
gene_id = ':'.join((db, gene_num))
if self.testMode \
and not(
re.match(r'NCBIGene', gene_id) and
int(gene_num) in self.test_ids):
continue
model.addClassToGraph(gene_id, gene_symbol)
if gene_name != '':
model.addDescription(gene_id, gene_name)
if gene_synonym != '':
for s in re.split(r'\|', gene_synonym):
model.addSynonym(gene_id, s.strip())
if re.search(r'\|', taxon):
# TODO add annotations with >1 taxon
logger.info(">1 taxon (%s) on line %d. skipping", taxon,
line_counter)
else:
tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
geno.addTaxon(tax_id, gene_id)
assoc = Assoc(g, self.name)
assoc.set_subject(gene_id)
assoc.set_object(go_id)
eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
if eco_id is not None:
assoc.add_evidence(eco_id)
refs = re.split(r'\|', ref)
for r in refs:
#.........这里部分代码省略.........