本文整理汇总了Python中dipper.utils.GraphUtils.GraphUtils.addTriple方法的典型用法代码示例。如果您正苦于以下问题：Python GraphUtils.addTriple方法的具体用法？Python GraphUtils.addTriple怎么用？Python GraphUtils.addTriple使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类dipper.utils.GraphUtils.GraphUtils的用法示例。
在下文中一共展示了GraphUtils.addTriple方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _get_process_allelic_variants
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addTriple [as 别名]
def _get_process_allelic_variants(self, entry, g):
    """
    Add the allelic variants listed in one OMIM entry to the graph.

    Each item of ``entry['allelicVariantList']`` is handled by its status:

    * ``live``: the variant is added as a variant locus that is a sequence
      variant instance of the entry; every reference number cited in its
      text (``{N:...}`` markers) is resolved through the map returned by
      ``self._get_pubs`` and linked to the variant with ``is_about``;
      dbSNP ids are added as equivalent classes, ClinVar RCV accessions
      as xrefs, and an omim.org page is attached.
    * a status containing ``moved`` (covers ``moved`` and ``removed``):
      the variant is added as a deprecated individual, replaced by
      ``movedTo`` when that key is present.
    * anything else is logged as an error.

    NOTE(review): the indentation of this listing was lost; the nesting
    below was reconstructed from the control flow - confirm against the
    upstream file.

    :param entry: dict for a single OMIM entry; nothing is processed
        when it is None
    :param g: graph the triples are added to
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    geno = Genotype(g)
    if entry is None:
        return

    entry_num = entry['mimNumber']
    entry_id = 'OMIM:' + str(entry_num)
    # process the ref list just to get the pmids
    ref_to_pmid = self._get_pubs(entry, g)

    if 'allelicVariantList' not in entry:
        return

    for al in entry['allelicVariantList']:
        variant = al['allelicVariant']
        al_num = str(variant['number']).zfill(4)
        al_id = entry_id + '.' + al_num
        status = variant['status']

        if status == 'live':
            al_label = variant['mutations'] if 'mutations' in variant else None
            al_description = None
            # reference numbers mentioned in this variant's text
            cited_refs = set()
            if 'text' in variant:
                al_description = variant['text']
                cited_refs = set(re.findall(r'\{(\d+)\:', al_description))

            geno.addAllele(
                al_id, al_label, geno.genoparts['variant_locus'],
                al_description)
            geno.addAlleleOfGene(
                al_id, entry_id,
                geno.object_properties['is_sequence_variant_instance_of'])

            # look up each cited reference number in the list of references
            for ref_num in cited_refs:
                pmid = ref_to_pmid[int(ref_num)]
                gu.addTriple(g, pmid, gu.object_properties['is_about'], al_id)

            if 'dbSnps' in variant:
                for dnum in variant['dbSnps'].split(','):
                    did = 'dbSNP:' + dnum.strip()
                    gu.addIndividualToGraph(g, did, None)
                    gu.addEquivalentClass(g, al_id, did)

            if 'clinvarAccessions' in variant:
                # accessions are triple-semicolon delimited,
                # each one looking like RCV000020059;;1
                for accession in variant['clinvarAccessions'].split(';;;'):
                    rcv_match = re.match(r'(RCV\d+)\;\;', accession)
                    if rcv_match is None:
                        # e.g. an empty chunk after a trailing delimiter;
                        # skip it instead of failing on None.group()
                        logger.warning(
                            "Skipping unparseable ClinVar accession '%s' "
                            "for %s", accession, al_id)
                        continue
                    gu.addXref(g, al_id, 'ClinVar:' + rcv_match.group(1))

            gu.addPage(
                g, al_id,
                "http://omim.org/entry/" + str(entry_num) + "#" + al_num)
        elif 'moved' in status:
            # for both 'moved' and 'removed'
            moved_ids = None
            if 'movedTo' in variant:
                moved_ids = ['OMIM:' + variant['movedTo']]
            gu.addDeprecatedIndividual(g, al_id, moved_ids)
        else:
            logger.error('Uncaught alleleic variant status %s', status)
    # end loop allelicVariantList
    return
示例2: _process_pathway_disease
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addTriple [as 别名]
def _process_pathway_disease(self, limit):
    """
    Link KEGG pathways to the diseases associated with them.

    Reads the tab-delimited ``pathway_disease`` file, one
    ``(disease_id, kegg_pathway_num)`` pair per row, and adds a triple
    stating that the pathway is causally upstream of or within the
    disease. In test mode only pathways listed in
    ``self.test_ids['pathway']`` are kept and ``limit`` is ignored.

    :param limit: stop once more than this many rows have been read;
        None means no limit
    :return: None
    """
    logger.info("Processing KEGG pathways to disease ids")
    g = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['pathway_disease']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for line_counter, row in enumerate(filereader, start=1):
            (disease_id, kegg_pathway_num) = row

            if self.testMode and \
                    kegg_pathway_num not in self.test_ids['pathway']:
                continue

            # the pathway id will look like
            # KEGG-path:map04130 or KEGG-path:hsa04130
            gu.addTriple(
                g, 'KEGG-' + kegg_pathway_num,
                GraphUtils.object_properties[
                    'causally_upstream_of_or_within'],
                'KEGG-' + disease_id)

            # the row limit only applies outside of test mode
            if self.testMode or limit is None:
                continue
            if line_counter > limit:
                break
    return
示例3: _get_pubs
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addTriple [as 别名]
def _get_pubs(self, entry, g):
    """
    Add the publications of an OMIM entry's reference list to the graph.

    A reference carrying a ``pubmedID`` becomes a ``PMID:`` journal
    article. Any other reference gets an internal ``_OMIM<entry>ref<n>``
    id (prefixed with ``:`` when ``self.nobnodes`` is set) together with
    whatever title, author list and short citation the entry provides.
    Every reference is then linked from the OMIM entry with ``mentions``.

    NOTE(review): the indentation of this listing was lost. The title /
    authors / citation handling is placed inside the non-PubMed branch
    here - confirm against the upstream file.

    :param entry: dict for one OMIM entry; must contain 'mimNumber'
    :param g: graph the references are added to
    :return: dict mapping each 'referenceNumber' to the publication id
        that was added for it
    """
    ref_to_pmid = {}
    du = DipperUtil()
    entry_num = entry['mimNumber']
    gu = GraphUtils(curie_map.get())
    if 'referenceList' not in entry:
        return ref_to_pmid

    omim_id = 'OMIM:' + str(entry_num)
    for r in entry['referenceList']:
        reference = r['reference']
        if 'pubmedID' in reference:
            pub_id = 'PMID:' + str(reference['pubmedID'])
            ref = Reference(pub_id, Reference.ref_types['journal_article'])
        else:
            # make blank node for internal reference
            pub_id = \
                '_OMIM' + str(entry_num) + 'ref' + \
                str(reference['referenceNumber'])
            if self.nobnodes:
                pub_id = ':' + pub_id
            ref = Reference(pub_id)

            title = author_list = source = citation = None
            if 'title' in reference:
                title = reference['title']
                ref.setTitle(title)
            if 'authors' in reference:
                author_list = reference['authors']
                ref.setAuthorList(author_list)
                # keep only the text before the first '.,' separator
                citation = author_list.split('.,')[0] + ' et al'
            if 'source' in reference:
                source = reference['source']
            citation = '; '.join(du.flatten([citation, title, source]))
            ref.setShortCitation(citation)

        ref.addRefToGraph(g)
        ref_to_pmid[reference['referenceNumber']] = pub_id

        # record that the OMIM entry mentions this publication
        gu.addTriple(g, omim_id, gu.object_properties['mentions'], pub_id)
    return ref_to_pmid
示例4: _process_pathway_pubmed
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addTriple [as 别名]
def _process_pathway_pubmed(self, limit):
    """
    Link publications to the KEGG pathways they are annotated to.

    Reads the tab-delimited ``pathway_pubmed`` file, one
    ``(pubmed_id, kegg_pathway_num)`` pair per row. Each publication is
    added as a journal article and connected to the pathway with
    ``is_about``. In test mode only pathways listed in
    ``self.test_ids['pathway']`` are kept and ``limit`` is ignored.

    :param limit: stop once more than this many rows have been read;
        None means no limit
    :return: None
    """
    logger.info("Processing KEGG pathways to pubmed ids")
    g = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['pathway_pubmed']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for line_counter, row in enumerate(filereader, start=1):
            (pubmed_id, kegg_pathway_num) = row

            if self.testMode and \
                    kegg_pathway_num not in self.test_ids['pathway']:
                continue

            pubmed_id = pubmed_id.upper()
            # the pathway id will look like KEGG-path:map04130
            kegg_id = 'KEGG-' + kegg_pathway_num

            article = Reference(
                pubmed_id, Reference.ref_types['journal_article'])
            article.addRefToGraph(g)
            gu.addTriple(
                g, pubmed_id,
                GraphUtils.object_properties['is_about'], kegg_id)

            # the row limit only applies outside of test mode
            if self.testMode or limit is None:
                continue
            if line_counter > limit:
                break
    return
示例5: _process_data
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addTriple [as 别名]
#.........这里部分代码省略.........
geno.genoparts['genomic_variation_complement'])
# add the gvc to the genotype
if genotype_id is not None:
if affected == 'unaffected':
rel = \
geno.object_properties[
'has_reference_part']
else:
rel = \
geno.object_properties[
'has_alternate_part']
geno.addParts(gvc_id, genotype_id, rel)
if karyotype_id is not None \
and self._is_normal_karyotype(karyotype):
if gvc_label is not None and gvc_label != '':
genotype_label = \
'; '.join((gvc_label, karyotype))
else:
genotype_label = karyotype
if genotype_id is None:
genotype_id = karyotype_id
else:
geno.addParts(
karyotype_id, genotype_id,
geno.object_properties[
'has_reference_part'])
else:
genotype_label = gvc_label
# use the catalog id as the background
genotype_label += ' ['+catalog_id.strip()+']'
if genotype_id is not None and gvc_id is not None:
# only add the genotype if it has some parts
geno.addGenotype(
genotype_id, genotype_label,
geno.genoparts['intrinsic_genotype'])
geno.addTaxon(taxon, genotype_id)
# add that the patient has the genotype
# TODO check if the genotype belongs to
# the cell line or to the patient
gu.addTriple(
g, patient_id,
geno.properties['has_genotype'], genotype_id)
else:
geno.addTaxon(taxon, patient_id)
# TODO: Add sex/gender (as part of the karyotype?)
# ############# DEAL WITH THE DISEASES #############
# we associate the disease to the patient
if affected == 'affected':
if omim_number != '':
for d in omim_number.split(';'):
if d is not None and d != '':
# if the omim number is in omim_map,
# then it is a gene not a pheno
if d not in omim_map:
disease_id = 'OMIM:'+d.strip()
# assume the label is taken care of
gu.addClassToGraph(g, disease_id, None)
# add the association:
# the patient has the disease
assoc = G2PAssoc(
self.name, patient_id, disease_id)
assoc.add_association_to_graph(g)
# this line is a model of this disease
# TODO abstract out model into
# it's own association class?
gu.addTriple(
g, cell_line_id,
gu.properties['model_of'],
disease_id)
else:
logger.info(
'removing %s from disease list ' +
'since it is a gene', d)
# ############# ADD PUBLICATIONS #############
if pubmed_ids != '':
for s in pubmed_ids.split(';'):
pubmed_id = 'PMID:'+s.strip()
ref = Reference(pubmed_id)
ref.setType(Reference.ref_types['journal_article'])
ref.addRefToGraph(g)
gu.addTriple(
g, pubmed_id, gu.properties['mentions'],
cell_line_id)
if not self.testMode \
and (limit is not None and line_counter > limit):
break
Assoc(self.name).load_all_properties(g)
return
示例6: Pathway
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addTriple [as 别名]
class Pathway:
    """
    Convenience methods for describing pathways and the genes / gene
    products that take part in them.
    """

    # class ids used as types for pathways and their parts
    pathway_parts = {
        'signal_transduction': 'GO:0007165',
        'cellular_process': 'GO:0009987',
        'pathway': 'PW:0000001',
        'gene_product': 'CHEBI:33695'  # bioinformation molecule
    }

    # relations used between genes, gene products and pathways
    object_properties = {
        'involved_in': 'RO:0002331',
        'gene_product_of': 'RO:0002204',
        'has_gene_product': 'RO:0002205'
    }

    # independent copy of the object properties
    properties = dict(object_properties)

    def __init__(self, graph, nobnodes=False):
        """
        Bind the helper to a graph and load the object properties into it.

        :param graph: graph that all methods of this instance write to
        :param nobnodes: when true, generated internal ids are prefixed
            with ':'
        """
        self.gu = GraphUtils(curie_map.get())
        self.graph = graph
        self.nobnodes = nobnodes
        self.gu.loadProperties(
            self.graph, self.object_properties, self.gu.OBJPROP)

    def addPathway(
            self, pathway_id, pathway_label, pathway_type=None,
            pathway_description=None):
        """
        Add a pathway as a class.

        When no type is given the class is typed with the
        'cellular_process' part. A subclass statement involving the
        generic 'pathway' part is always added as well.

        :param pathway_id:
        :param pathway_label:
        :param pathway_type: defaults to the 'cellular_process' part
        :param pathway_description:
        :return: None
        """
        resolved_type = pathway_type
        if resolved_type is None:
            resolved_type = self.pathway_parts['cellular_process']
        self.gu.addClassToGraph(
            self.graph, pathway_id, pathway_label, resolved_type,
            pathway_description)
        self.gu.addSubclass(
            self.graph, self.pathway_parts['pathway'], pathway_id)

    def addGeneToPathway(self, pathway_id, gene_id):
        """
        Attach a gene to a pathway through an intermediate gene product.

        The gene product gets an internal id derived from the gene id:
            gene_id        RO:has_gene_product  _gene_product
            _gene_product  RO:involved_in       pathway_id

        :param pathway_id:
        :param gene_id:
        :return: None
        """
        product_id = '_' + gene_id.replace(':', '') + 'product'
        if self.nobnodes:
            product_id = ':' + product_id
        self.gu.addIndividualToGraph(
            self.graph, product_id, None,
            self.pathway_parts['gene_product'])
        has_product = self.object_properties['has_gene_product']
        self.gu.addTriple(self.graph, gene_id, has_product, product_id)
        self.addComponentToPathway(pathway_id, product_id)

    def addComponentToPathway(self, pathway_id, component_id):
        """
        State that a component is directly involved in a pathway.

        Use addGeneToPathway instead when the component first has to be
        transformed (a gene into its gene product).

        :param pathway_id:
        :param component_id:
        :return: None
        """
        involved_in = self.object_properties['involved_in']
        self.gu.addTriple(
            self.graph, component_id, involved_in, pathway_id)
示例7: _process_phenotype_data
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addTriple [as 别名]
#.........这里部分代码省略.........
geno.genoparts['variant_locus'])
vl_set.add(vl_id)
if len(variants) == 1 and len(genes) == 1:
for gene in genes:
geno.addAlleleOfGene(vl_id, gene)
else:
geno.addAllele(vl_id, vl_symbol)
else: # len(vars) == 0
# it's just anonymous variants in some gene
for gene in genes:
vl_id = '_'+gene+'-VL'
vl_id = re.sub(r':', '', vl_id)
if self.nobnodes:
vl_id = ':'+vl_id
vl_symbol = self.id_label_hash[gene]+'<?>'
self.id_label_hash[vl_id] = vl_symbol
geno.addAllele(vl_id, vl_symbol,
geno.genoparts['variant_locus'])
geno.addGene(gene, self.id_label_hash[gene])
geno.addAlleleOfGene(vl_id, gene)
vl_set.add(vl_id)
# make the vslcs
vl_list = sorted(vl_set)
vslc_list = []
for vl in vl_list:
# for unknown zygosity
vslc_id = '_'+re.sub(r'^_', '', vl)+'U'
vslc_id = re.sub(r':', '', vslc_id)
if self.nobnodes:
vslc_id = ':' + vslc_id
vslc_label = self.id_label_hash[vl] + '/?'
self.id_label_hash[vslc_id] = vslc_label
vslc_list.append(vslc_id)
geno.addPartsToVSLC(
vslc_id, vl, None, geno.zygosity['indeterminate'],
geno.object_properties['has_alternate_part'], None)
gu.addIndividualToGraph(
g, vslc_id, vslc_label,
geno.genoparts['variant_single_locus_complement'])
if len(vslc_list) > 0:
if len(vslc_list) > 1:
gvc_id = '-'.join(vslc_list)
gvc_id = re.sub(r':', '', gvc_id)
if self.nobnodes:
gvc_id = ':'+gvc_id
gvc_label = \
'; '.join(self.id_label_hash[v] for v in vslc_list)
gu.addIndividualToGraph(
g, gvc_id, gvc_label,
geno.genoparts['genomic_variation_complement'])
for vslc_id in vslc_list:
geno.addVSLCtoParent(vslc_id, gvc_id)
else:
# the GVC == VSLC, so don't have to make an extra piece
gvc_id = vslc_list.pop()
gvc_label = self.id_label_hash[gvc_id]
genotype_label = gvc_label + ' [n.s.]'
bkgd_id = \
'_' + re.sub(r':', '', '-'.join(
(geno.genoparts['unspecified_genomic_background'],
s)))
genotype_id = '-'.join((gvc_id, bkgd_id))
if self.nobnodes:
bkgd_id = ':'+bkgd_id
geno.addTaxon(mouse_taxon, bkgd_id)
geno.addGenomicBackground(
bkgd_id, 'unspecified ('+s+')',
geno.genoparts['unspecified_genomic_background'],
"A placeholder for the " +
"unspecified genetic background for "+s)
geno.addGenomicBackgroundToGenotype(
bkgd_id, genotype_id,
geno.genoparts['unspecified_genomic_background'])
geno.addParts(
gvc_id, genotype_id,
geno.object_properties['has_alternate_part'])
geno.addGenotype(genotype_id, genotype_label)
gu.addTriple(
g, s, geno.object_properties['has_genotype'],
genotype_id)
else:
# logger.debug(
# "Strain %s is not making a proper genotype.", s)
pass
gu.loadProperties(
g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
gu.loadProperties(
g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
gu.loadProperties(
g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
gu.loadAllProperties(g)
logger.warning(
"The following gene symbols did not list identifiers: %s",
str(sorted(list(genes_with_no_ids))))
return
示例8: Feature
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addTriple [as 别名]
#.........这里部分代码省略.........
if add_region:
# create a region that has the begin/end positions
regionchr = re.sub(r'\w+\:_?', '', self.start['reference'])
if region_id is None:
# in case the values are undefined
# if we know only one of the coordinates,
# then we'll add an "unknown" other.
st = sp = 'UN'
strand = None
if self.start is not None and \
self.start['coordinate'] is not None:
st = str(self.start['coordinate'])
strand = self._getStrandStringFromPositionTypes(
self.start['type'])
if self.stop is not None and\
self.stop['coordinate'] is not None:
sp = str(self.stop['coordinate'])
if strand is not None:
strand = self._getStrandStringFromPositionTypes(
self.stop['type'])
# assume that the strand is the same for both start and stop.
# this will need to be fixed in the future
region_items = [regionchr, st, sp]
if strand is not None:
region_items += [strand]
region_id = '-'.join(region_items)
rid = region_id
rid = re.sub(r'\w+\:', '', rid, 1) # replace the id prefix
rid = '_'+rid+"-Region"
region_id = rid
if self.nobnodes:
region_id = ':'+region_id
self.gu.addTriple(graph, self.id, self.properties['location'],
region_id)
self.gu.addIndividualToGraph(
graph, region_id, None, 'faldo:Region')
else:
region_id = self.id
self.gu.addType(graph, region_id, 'faldo:Region')
# add the start/end positions to the region
beginp = endp = None
if self.start is not None:
beginp = self._makePositionId(self.start['reference'],
self.start['coordinate'],
self.start['type'])
self.addPositionToGraph(graph,
self.start['reference'],
self.start['coordinate'],
self.start['type'])
if self.stop is not None:
endp = self._makePositionId(self.stop['reference'],
self.stop['coordinate'],
self.stop['type'])
self.addPositionToGraph(graph,
self.stop['reference'],
self.stop['coordinate'],
self.stop['type'])
self.addRegionPositionToGraph(graph, region_id, beginp, endp)
# {coordinate : integer, reference : reference_id, types = []}
return
示例9: MPD
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addTriple [as 别名]
#.........这里部分代码省略.........
:param phenotypes: a list of phenotypes to association with the strain
:param comment:
:return:
"""
eco_id = "ECO:0000059" # experimental_phenotypic_evidence
strain_label = self.idlabel_hash.get(strain_id)
# strain genotype
genotype_id = '_'+'-'.join((re.sub(r':', '', strain_id), 'genotype'))
genotype_label = '['+strain_label+']'
sex_specific_genotype_id = '_'+'-'.join((re.sub(r':', '', strain_id),
sex, 'genotype'))
if strain_label is not None:
sex_specific_genotype_label = strain_label + ' (' + sex + ')'
else:
sex_specific_genotype_label = strain_id + '(' + sex + ')'
if self.nobnodes:
genotype_id = ':'+genotype_id
sex_specific_genotype_id = ':'+sex_specific_genotype_id
genotype_type = Genotype.genoparts['sex_qualified_genotype']
if sex == 'm':
genotype_type = Genotype.genoparts['male_genotype']
elif sex == 'f':
genotype_type = Genotype.genoparts['female_genotype']
# add the genotype to strain connection
self.geno.addGenotype(
genotype_id, genotype_label,
Genotype.genoparts['genomic_background'])
self.gu.addTriple(
g, strain_id,
Genotype.object_properties['has_genotype'], genotype_id)
self.geno.addGenotype(
sex_specific_genotype_id, sex_specific_genotype_label,
genotype_type)
# add the strain as the background for the genotype
self.gu.addTriple(
g, sex_specific_genotype_id,
Genotype.object_properties['has_sex_agnostic_genotype_part'],
genotype_id)
# ############# BUILD THE G2P ASSOC #############
# TODO add more provenance info when that model is completed
if phenotypes is not None:
for phenotype_id in phenotypes:
assoc = G2PAssoc(
self.name, sex_specific_genotype_id, phenotype_id)
assoc.add_evidence(assay_id)
assoc.add_evidence(eco_id)
assoc.add_association_to_graph(g)
assoc_id = assoc.get_association_id()
self.gu.addComment(g, assoc_id, comment)
return
def getTestSuite(self):
import unittest
from tests.test_mpd import MPDTestCase
示例10: Genotype
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addTriple [as 别名]
#.........这里部分代码省略.........
if (allele_type is None):
allele_type = self.genoparts['allele'] #TODO is this a good idea?
self.gu.addIndividualToGraph(self.graph, allele_id, allele_label, allele_type, allele_description)
return
def addGene(self, gene_id, gene_label, gene_type=None, gene_description=None):
    """
    Add a gene to the graph as a class.

    :param gene_id:
    :param gene_label:
    :param gene_type: defaults to the 'gene' entry of self.genoparts
    :param gene_description:
    :return: None
    """
    resolved_type = self.genoparts['gene'] if gene_type is None else gene_type
    # genes are modeled as classes, not individuals
    self.gu.addClassToGraph(
        self.graph, gene_id, gene_label, resolved_type, gene_description)
def addConstruct(self, construct_id, construct_label, construct_type=None,
                 construct_description=None):
    """
    Add a construct to the graph as an individual.

    The type is passed through unchanged, so it stays None when the
    caller gives none.

    :param construct_id:
    :param construct_label:
    :param construct_type:
    :param construct_description:
    :return: None
    """
    # TODO add a base type for constructs and fall back to it
    # when construct_type is None
    self.gu.addIndividualToGraph(
        self.graph, construct_id, construct_label, construct_type,
        construct_description)
def addDerivesFrom(self, child_id, parent_id):
    """
    Add a derives_from relationship from the child to the parent.

    Typical uses: an allele and the construct or strain it comes from,
    or a cell line and its parent genotype. Only the relationship is
    added here; adding the child and the parent themselves to the graph
    is the caller's job.

    :param child_id:
    :param parent_id:
    :return: None
    """
    derives_from = self.properties['derives_from']
    self.gu.addTriple(self.graph, child_id, derives_from, parent_id)
def addSequenceDerivesFrom(self, child_id, parent_id):
    """
    Add a derives_sequence_from_gene relationship from child to parent.

    :param child_id:
    :param parent_id:
    :return: None
    """
    relation = self.properties['derives_sequence_from_gene']
    self.gu.addTriple(self.graph, child_id, relation, parent_id)
def addAlleleOfGene(self, allele_id, gene_id, rel_id=None):
    """
    Relate an allele to its gene.

    When no relationship is given, 'is_sequence_variant_instance_of'
    from self.properties is used. The allele is expected to be a
    variant_locus rather than a sequence alteration.

    :param allele_id:
    :param gene_id:
    :param rel_id: relationship to use instead of the default
    :return: None
    """
    relation = rel_id
    if relation is None:
        relation = self.properties['is_sequence_variant_instance_of']
    self.gu.addTriple(self.graph, allele_id, relation, gene_id)
def addTranscript(self, variant_id, transcript_id, transcript_label=None, transcript_type=None):
"""
Add gene/variant/allele transcribes_to relationship
:param variant_id:
:param transcript_id:
:param transcript_label:
:param transcript_type:
:return:
"""
示例11: _get_var_citations
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addTriple [as 别名]
def _get_var_citations(self, limit):
    """
    Link ClinVar variants to the publications that cite them.

    Reads the tab-delimited ``variant_citations`` report, whose columns
    are described in the source file as:

    * AlleleID: integer value as stored in the AlleleID field in ClinVar
    * VariationID: the identifier ClinVar uses to anchor its default
      display
    * rs: rs identifier from dbSNP
    * nsv: nsv identifier from dbVar
    * citation_source: PubMed, PubMedCentral, or the NCBI Bookshelf
    * citation_id: the identifier used by that source

    Only PubMed and PubMedCentral citations are turned into references;
    any other source is skipped. It is not known which of the ids a
    citation refers to other than the variant, so the reference is
    attached to the ClinVar variant id. In test mode only variants in
    ``self.variant_ids`` are kept and ``limit`` is ignored.

    :param limit: stop once more than this many data rows have been
        read; None means no limit
    :return: None
    """
    # citation source -> curie prefix of the reference id
    ref_prefixes = {'PubMed': 'PMID:', 'PubMedCentral': 'PMCID:'}

    gu = GraphUtils(curie_map.get())
    logger.info("Processing Citations for variants")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['variant_citations']['file']))
    g = self.testgraph if self.testMode else self.graph

    with open(myfile, 'r', encoding="utf8") as f:
        filereader = csv.reader(f, delimiter='\t', quotechar='\"')
        for line in filereader:
            # csv.reader yields an empty list for a blank line, so test
            # for that before looking at the first field; skip comments
            if not line or line[0].startswith('#'):
                continue
            (_allele_num, variant_num, _rs_num, _nsv_num,
             citation_source, citation_id) = line
            line_counter += 1

            if self.testMode and int(variant_num) not in self.variant_ids:
                continue
            if citation_id.strip() == '':
                logger.info(
                    "Skipping blank citation for ClinVarVariant:%s",
                    str(variant_num))
                continue

            var_id = 'ClinVarVariant:' + variant_num
            prefix = ref_prefixes.get(citation_source)
            if prefix is not None:
                ref_id = prefix + str(citation_id)
                r = Reference(ref_id, Reference.ref_types['journal_article'])
                r.addRefToGraph(g)
                gu.addTriple(g, ref_id, self.properties['is_about'], var_id)

            if not self.testMode and \
                    (limit is not None and line_counter > limit):
                break
    logger.info("Finished processing citations for variants")
    return
示例12: GeneReviews
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addTriple [as 别名]
#.........这里部分代码省略.........
# figure out if the book is there; if so, process, otherwise skip
book_dir = '/'.join((self.rawdir, 'books'))
book_files = os.listdir(book_dir)
if ''.join((nbk, '.html')) not in book_files:
# logger.warning("No book found locally for %s; skipping", nbk)
books_not_found.add(nbk)
continue
logger.info("Processing %s", nbk)
page = open(url)
soup = BeautifulSoup(page.read())
# sec0 == clinical description
clin_summary = \
soup.find(
'div', id=re.compile(".*Summary.sec0"))
if clin_summary is not None:
p = clin_summary.find('p')
ptext = p.text
ptext = re.sub(r'\s+', ' ', ptext)
ul = clin_summary.find('ul')
if ul is not None:
item_text = list()
for li in ul.find_all('li'):
item_text.append(re.sub(r'\s+', ' ', li.text))
ptext += ' '.join(item_text)
# add in the copyright and citation info to description
ptext = \
' '.join(
(ptext,
'[GeneReviews:NBK1116, GeneReviews:NBK138602, ' +
nbk_id+']'))
self.gu.addDefinition(self.graph, nbk_id, ptext.strip())
# get the pubs
pmid_set = set()
pub_div = soup.find('div', id=re.compile(r".*Literature_Cited"))
if pub_div is not None:
ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
for r in ref_list:
for a in r.find_all(
'a', attrs={'href': re.compile(r"pubmed")}):
if re.match(r'PubMed:', a.text):
pmnum = re.sub(r'PubMed:\s*', '', a.text)
else:
pmnum = \
re.search(
r'\/pubmed\/(\d+)$', a['href']).group(1)
if pmnum is not None:
pmid = 'PMID:'+str(pmnum)
self.gu.addTriple(
self.graph, pmid,
self.gu.object_properties['is_about'],
nbk_id)
pmid_set.add(pmnum)
r = Reference(
pmid, Reference.ref_types['journal_article'])
r.addRefToGraph(self.graph)
# TODO add author history, copyright, license to dataset
# TODO get PMID-NBKID equivalence (near foot of page),
# and make it "is about" link
# self.gu.addTriple(
# self.graph, pmid,
# self.gu.object_properties['is_about'], nbk_id)
# for example: NBK1191 PMID:20301370
# add the book to the dataset
self.dataset.setFileAccessUrl(book_item['url'])
if limit is not None and c > limit:
break
# finish looping through books
l = len(books_not_found)
if len(books_not_found) > 0:
if l > 100:
logger.warning("There were %d books not found.", l)
else:
logger.warning(
"The following %d books were not found locally: %s",
l, str(books_not_found))
logger.info(
"Finished processing %d books for clinical descriptions", c-l)
return
def getTestSuite(self):
    """
    Build the unittest suite for the GeneReviews source.

    :return: suite loaded from tests.test_genereviews.GeneReviewsTestCase
    """
    import unittest
    from tests.test_genereviews import GeneReviewsTestCase
    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(GeneReviewsTestCase)
示例13: _process_data
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addTriple [as 别名]
#.........这里部分代码省略.........
# http://ki.mit.edu/sbc/escell/services/details
# here, we'll make a genotype
# that derives from an ES cell with a given allele.
# the strain is not really attached to the colony.
# the colony/clone is reflective of the allele,
# with unknown zygosity
stem_cell_class = 'ERO:0002002'
gu.addIndividualToGraph(g, colony_id, colony, stem_cell_class)
# vslc of the colony has unknown zygosity
# note that we will define the allele
# (and it's relationship to the marker, etc.) later
# FIXME is it really necessary to create this vslc
# when we always know it's unknown zygosity?
vslc_colony = \
'_'+allele_accession_id+geno.zygosity['indeterminate']
vslc_colony = re.sub(r':', '', vslc_colony)
if self.nobnodes:
vslc_colony = ':'+vslc_colony
vslc_colony_label = allele_symbol+'/<?>'
# for ease of reading, we make the colony genotype variables.
# in the future, it might be desired to keep the vslcs
colony_genotype_id = vslc_colony
colony_genotype_label = vslc_colony_label
geno.addGenotype(colony_genotype_id, colony_genotype_label)
geno.addParts(allele_accession_id, colony_genotype_id,
geno.object_properties['has_alternate_part'])
geno.addPartsToVSLC(
vslc_colony, allele_accession_id, None,
geno.zygosity['indeterminate'],
geno.object_properties['has_alternate_part'])
gu.addTriple(
g, colony_id,
geno.object_properties['has_genotype'],
colony_genotype_id)
# ########## BUILD THE ANNOTATED GENOTYPE ##########
# now, we'll build the genotype of the individual that derives
# from the colony/clone genotype that is attached to
# phenotype = colony_id + strain + zygosity + sex
# (and is derived from a colony)
# this is a sex-agnostic genotype
genotype_id = \
self.make_id(
(colony_id + phenotyping_center + zygosity +
strain_accession_id))
geno.addSequenceDerivesFrom(genotype_id, colony_id)
# build the VSLC of the sex-agnostic genotype
# based on the zygosity
allele1_id = allele_accession_id
allele2_id = allele2_rel = None
allele1_label = allele_symbol
allele2_label = '<?>'
# Making VSLC labels from the various parts,
# can change later if desired.
if zygosity == 'heterozygote':
allele2_label = re.sub(r'<.*', '<+>', allele1_label)
allele2_id = None
elif zygosity == 'homozygote':
allele2_label = allele1_label
allele2_id = allele1_id
allele2_rel = geno.object_properties['has_alternate_part']
示例14: OMIA
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addTriple [as 别名]
#.........这里部分代码省略.........
self.g, omia_id, group_name, disease_id, group_summary)
self.label_hash[omia_id] = group_name
return
def _process_gene_row(self, row):
if self.testMode and row['gene_id'] not in self.test_ids['gene']:
return
gene_id = 'NCBIGene:'+str(row['gene_id'])
self.id_hash['gene'][row['gene_id']] = gene_id
gene_label = row['symbol']
self.label_hash[gene_id] = gene_label
tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
gene_type_id = NCBIGene.map_type_of_gene(row['gene_type'])
self.gu.addClassToGraph(self.g, gene_id, gene_label, gene_type_id)
self.geno.addTaxon(tax_id, gene_id)
return
def _process_article_breed_row(self, row):
# article_id, breed_id, added_by
# don't bother putting these into the test... too many!
# and int(row['breed_id']) not in self.test_ids['breed']:
if self.testMode:
return
article_id = self.id_hash['article'].get(row['article_id'])
breed_id = self.id_hash['breed'].get(row['breed_id'])
# there's some missing data (article=6038). in that case skip
if article_id is not None:
self.gu.addTriple(
self.g, article_id, self.gu.object_properties['is_about'],
breed_id)
else:
logger.warning("Missing article key %s", str(row['article_id']))
return
def _process_article_phene_row(self, row):
"""
Linking articles to species-specific phenes.
:param row:
:return:
"""
# article_id, phene_id, added_by
# look up the article in the hashmap
phenotype_id = self.id_hash['phene'].get(row['phene_id'])
article_id = self.id_hash['article'].get(row['article_id'])
omia_id = self._get_omia_id_from_phene_id(phenotype_id)
if self.testMode and omia_id not in self.test_ids['disease'] \
or phenotype_id is None or article_id is None:
return
# make a triple, where the article is about the phenotype
self.gu.addTriple(
self.g, article_id,
self.gu.object_properties['is_about'], phenotype_id)
return
def _process_breed_phene_row(self, row):
示例15: _get_gene2pubmed
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import addTriple [as 别名]
def _get_gene2pubmed(self, limit):
    """
    Add "publication is_about gene" triples from the gene2pubmed file.

    The gzipped file is read line by line; every data line holds a
    tab-separated ``(tax_num, gene_num, pubmed_num)`` triple. In test
    mode only genes in ``self.gene_ids`` are kept; otherwise only taxa
    in ``self.tax_ids`` are kept. Genes are added as classes or as
    individuals according to ``self.class_or_indiv``; publications are
    added as individuals and as journal-article references.

    :param limit: outside of test mode, stop once more than this many
        lines have been accepted; None means no limit
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    g = self.testgraph if self.testMode else self.graph

    logger.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['gene2pubmed']['file']))
    logger.info("FILE: %s", myfile)
    assoc_counter = 0
    with gzip.open(myfile, 'rb') as f:
        for line in f:
            line = line.decode().strip()
            # skip blank lines and comments
            if not line or line.startswith('#'):
                continue
            (tax_num, gene_num, pubmed_num) = line.split('\t')

            # '-' marks a missing value; reject it before the int()
            # conversions below, which would raise ValueError on it
            if gene_num == '-' or pubmed_num == '-':
                continue
            if self.testMode and int(gene_num) not in self.gene_ids:
                continue
            if not self.testMode and int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1
            gene_id = ':'.join(('NCBIGene', gene_num))
            pubmed_id = ':'.join(('PMID', pubmed_num))

            if self.class_or_indiv.get(gene_id) == 'C':
                gu.addClassToGraph(g, gene_id, None)
            else:
                gu.addIndividualToGraph(g, gene_id, None)

            # add the publication as a NamedIndividual,
            # then type it as a journal article
            gu.addIndividualToGraph(g, pubmed_id, None, None)
            r = Reference(
                pubmed_id, Reference.ref_types['journal_article'])
            r.addRefToGraph(g)
            gu.addTriple(
                g, pubmed_id, gu.object_properties['is_about'], gene_id)
            assoc_counter += 1

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break
    logger.info(
        "Processed %d pub-gene associations", assoc_counter)
    return