本文整理汇总了Python中dipper.utils.GraphUtils.GraphUtils.loadAllProperties方法的典型用法代码示例。如果您正苦于以下问题:Python GraphUtils.loadAllProperties方法的具体用法?Python GraphUtils.loadAllProperties怎么用?Python GraphUtils.loadAllProperties使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类dipper.utils.GraphUtils.GraphUtils的用法示例。
在下文中一共展示了GraphUtils.loadAllProperties方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadAllProperties [as 别名]
def parse(self, limit=None):
    """
    Run the complete parse for this source.

    When no version number is known yet, it is recovered from the name of
    the "letter.WS###" file in the raw directory.  The per-run lookup
    tables are then reset, every input file is processed in order, and
    the property declarations are loaded into the selected graph.

    :param limit: passed through to every ``process_*`` step; when not
        None it is also reported in the log as the per-file row limit
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %s rows of each file", limit)

    if self.version_num is None:
        import os
        logger.info("Figuring out version num for files")
        # Only the "letter.WS###" file keeps the version number in its
        # name, so look that file up in the raw directory.
        letter_file = next(
            name for name in os.listdir(self.rawdir)
            if re.match(r'letter', name))
        release = re.search(r'(WS\d+)', letter_file).group(1)
        self.update_wsnum_in_files(release)

    logger.info("Parsing files...")

    if self.testOnly:
        self.testMode = True
    g = self.testgraph if self.testMode else self.graph
    self.nobnodes = True  # FIXME

    # any label seen for a given id
    self.id_label_map = {}
    # genotype -> background
    self.genotype_backgrounds = {}
    self.extrinsic_id_to_enviro_id_hash = {}
    # genes that are variant because of a sequence alteration
    self.variant_loci_genes = {}
    # the parts that make up an environment
    self.environment_hash = {}
    self.wildtype_genotypes = []
    # rnai_reagent -> gene targets
    self.rnai_gene_map = {}

    # Steps currently left out:
    #   self.process_gene_desc(limit)  # TEC imput file is mia 2016-Mar-03
    #   self.process_gene_interaction(limit)  # TODO add when complete
    steps = (
        self.process_gene_ids,
        self.process_allele_phenotype,
        self.process_rnai_phenotypes,
        self.process_pub_xrefs,
        self.process_feature_loc,
        self.process_disease_association,
    )
    for step in steps:
        step(limit)

    logger.info("Finished parsing.")

    self.load_bindings()
    graph_utils = GraphUtils(curie_map.get())
    graph_utils.loadAllProperties(g)
    graph_utils.loadObjectProperties(g, Genotype.object_properties)

    logger.info("Found %d nodes in graph", len(self.graph))
    logger.info("Found %d nodes in testgraph", len(self.testgraph))
    return
示例2: _process_genes
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadAllProperties [as 别名]
def _process_genes(self, taxid, limit=None):
    """
    Add the genes listed in the tab-delimited file for one taxon to the graph.

    Each row is read as: gene id, external gene name, description,
    gene biotype and entrez gene id; for human (taxid '9606') a sixth
    column holding the HGNC id is read as well.  Every gene becomes a
    class, gets equivalence axioms to its NCBIGene / HGNC ids when those
    are present, and is attached to the taxon.

    :param taxid: taxon number as a string; used as the key into
        ``self.files`` and appended to the ``NCBITaxon:`` prefix
    :param limit: outside test mode, stop once more than this many rows
        have been processed; None means no limit
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    geno = Genotype(g)
    raw = '/'.join((self.rawdir, self.files[taxid]['file']))
    line_counter = 0
    logger.info("Processing Ensembl genes for tax %s", taxid)
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        for row in filereader:
            # Five columns are unpacked below, so anything shorter is a
            # malformed row (the previous "< 4" check let 4-column rows
            # through and crashed on the unpack).
            if len(row) < 5:
                logger.error("Data error for file %s", raw)
                return
            (ensembl_gene_id, external_gene_name, description,
             gene_biotype, entrezgene) = row[0:5]

            # in the case of human genes, we also get the hgnc id,
            # and is the last col; tolerate rows where it is absent
            if taxid == '9606' and len(row) > 5:
                hgnc_id = row[5]
            else:
                hgnc_id = None

            # in test mode only keep rows whose entrez id is a test gene
            if self.testMode and entrezgene != '' \
                    and int(entrezgene) not in self.gene_ids:
                continue

            line_counter += 1
            gene_id = 'ENSEMBL:'+ensembl_gene_id
            if description == '':
                description = None
            gene_type_id = self._get_gene_type(gene_biotype)
            # NOTE(review): the mapped type is discarded right away, so
            # genes are added untyped -- confirm whether this is intended.
            gene_type_id = None
            gu.addClassToGraph(
                g, gene_id, external_gene_name, gene_type_id, description)

            if entrezgene != '':
                gu.addEquivalentClass(g, gene_id, 'NCBIGene:'+entrezgene)
            if hgnc_id is not None and hgnc_id != '':
                gu.addEquivalentClass(g, gene_id, hgnc_id)
            geno.addTaxon('NCBITaxon:'+taxid, gene_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
    gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
    gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)
    return
示例3: _process_orthologs
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadAllProperties [as 别名]
def _process_orthologs(self, raw, limit=None):
    """
    Map the genes of a species onto KEGG orthology classes.

    Triples created:
    <gene_id> is a class
    <orthology_class_id> is a class
    <assoc_id> has subject <gene_id>
    <assoc_id> has object <orthology_class_id>

    :param raw: path of the tab-delimited input file; every row must
        hold exactly two fields (gene id, orthology class id)
    :param limit: outside test mode, stop once more than this many rows
        have been read; None means no limit
    :return: None
    """
    logger.info("Processing orthologs")
    g = self.testgraph if self.testMode else self.graph
    graph_utils = GraphUtils(curie_map.get())
    graph_utils.loadAllProperties(g)

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for line_counter, row in enumerate(reader, start=1):
            (gene_id, orthology_class_id) = row
            orthology_class_id = 'KEGG:'+orthology_class_id.strip()
            gene_id = 'KEGG:'+gene_id.strip()

            # The orthology class groups several genes, so it is added
            # as a gene-family grouping class rather than a 1:1 match.
            assoc = OrthologyAssoc(self.name, gene_id, None)
            assoc.add_gene_family_to_graph(g, orthology_class_id)

            # Both ends go in as bare classes;
            # labels are assumed to be supplied elsewhere.
            for class_id in (gene_id, orthology_class_id):
                graph_utils.addClassToGraph(g, class_id, None)

            limit_reached = limit is not None and line_counter > limit
            if limit_reached and not self.testMode:
                break

    logger.info("Done with orthologs")
    return
示例4: parse
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadAllProperties [as 别名]
def parse(self, limit=None):
    """
    MPD data is delivered in four separate csv files and one xml file,
    which we process iteratively and write out as
    one large graph.

    The processing steps fill lookup tables that later steps read, so
    they must run in the order given here.  Afterwards the property
    declarations are loaded into the selected graph.

    :param limit: passed through to every processing step; when not None
        it is also reported in the log as the per-file row limit
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %s rows fo each file", str(limit))

    logger.info("Parsing files...")

    if self.testOnly:
        self.testMode = True
        g = self.testgraph
        self.geno = Genotype(self.testgraph)
    else:
        g = self.graph

    self._process_straininfo(limit)
    # the following will provide us the hash-lookups
    # These must be processed in a specific order

    # mapping between assays and ontology terms
    self._process_ontology_mappings_file(limit)
    # this is the metadata about the measurements
    self._process_measurements_file(limit)
    # get all the measurements per strain
    self._process_strainmeans_file(limit)

    # The following will use the hash populated above
    # to lookup the ids when filling in the graph
    self._fill_provenance_graph(limit)

    logger.info("Finished parsing.")

    self.load_bindings()
    gu = GraphUtils(curie_map.get())
    gu.loadAllProperties(g)
    gu.loadProperties(g, G2PAssoc.object_properties, GraphUtils.OBJPROP)
    # datatype properties must be declared as data properties, not as
    # object properties (this was previously passed GraphUtils.OBJPROP)
    gu.loadProperties(g, G2PAssoc.datatype_properties, GraphUtils.DATAPROP)
    gu.loadProperties(
        g, G2PAssoc.annotation_properties, GraphUtils.ANNOTPROP)

    logger.info("Found %d nodes", len(self.graph))
    return
示例5: parse
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadAllProperties [as 别名]
def parse(self, limit=None):
    """
    Process every configured annotation file and load the results.

    Entries of ``self.files`` keyed 'go-references' and 'id-map' are not
    annotation files and are skipped.  Outside test mode, a file is only
    processed when its key, read as an integer, is in ``self.tax_ids``.

    :param limit: passed through to ``process_gaf``; when not None it is
        also reported in the log as the per-file row limit
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %s rows of each file", limit)
    logger.info("Parsing files...")

    if self.testOnly:
        self.testMode = True
    g = self.testgraph if self.testMode else self.graph
    self.nobnodes = True  # FIXME

    # lookup used to map uniprot ids to genes
    uniprot_entrez_id_map = self.get_uniprot_entrez_id_map()

    for key in self.files:
        if key in ('go-references', 'id-map'):
            continue
        if self.testMode or int(key) in self.tax_ids:
            gaf_path = '/'.join((self.rawdir, self.files.get(key)['file']))
            self.process_gaf(gaf_path, limit, uniprot_entrez_id_map)

    logger.info("Finished parsing.")

    self.load_bindings()
    graph_utils = GraphUtils(curie_map.get())
    graph_utils.loadAllProperties(g)
    graph_utils.loadObjectProperties(g, Genotype.object_properties)

    logger.info("Found %d nodes in graph", len(self.graph))
    logger.info("Found %d nodes in testgraph", len(self.testgraph))
    return
示例6: _process_all
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadAllProperties [as 别名]
#.........这里部分代码省略.........
if 'includedTitles' in titles:
other_labels += self._get_alt_labels(titles['includedTitles'])
# add synonyms of alternate labels
# preferredTitle": "PFEIFFER SYNDROME",
# "alternativeTitles": "ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;;\nACS V;;\nNOACK SYNDROME",
# "includedTitles": "CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED"
# remove the abbreviation (comes after the ;) from the preferredTitle, and add it as a synonym
abbrev = None
if len(re.split(';', label)) > 1:
abbrev = (re.split(';', label)[1].strip())
newlabel = self._cleanup_label(label)
description = self._get_description(e['entry'])
omimid = 'OMIM:'+str(omimnum)
if e['entry']['status'] == 'removed':
gu.addDeprecatedClass(g, omimid)
else:
omimtype = self._get_omimtype(e['entry'])
# this uses our cleaned-up label
gu.addClassToGraph(g, omimid, newlabel, omimtype)
# add the original OMIM label as a synonym
gu.addSynonym(g, omimid, label)
# add the alternate labels and includes as synonyms
for l in other_labels:
gu.addSynonym(g, omimid, l)
# for OMIM, we're adding the description as a definition
gu.addDefinition(g, omimid, description)
if abbrev is not None:
gu.addSynonym(g, omimid, abbrev)
# if this is a genetic locus (but not sequenced) then add the chrom loc info
if omimtype == Genotype.genoparts['biological_region']:
if 'geneMapExists' in e['entry'] and e['entry']['geneMapExists']:
genemap = e['entry']['geneMap']
if 'cytoLocation' in genemap:
cytoloc = genemap['cytoLocation']
# parse the cytoloc. add this omim thing as a subsequence of the cytofeature
# 18p11.3-p11.2
# for now, just take the first one
# FIXME add the other end of the range, but not sure how to do that
# not sure if saying subsequence of feature is the right relationship
cytoloc = cytoloc.split('-')[0]
f = Feature(omimid, None, None)
if 'chromosome' in genemap:
chrom = makeChromID(str(genemap['chromosome']), tax_num, 'CHR')
geno.addChromosomeClass(str(genemap['chromosome']), tax_id, tax_label)
loc = makeChromID(cytoloc, tax_num, 'CHR')
gu.addClassToGraph(g, loc, cytoloc) # this is the chr band
f.addSubsequenceOfFeature(g, loc)
f.addFeatureToGraph(g)
pass
# check if moved, if so, make it deprecated and replaced/consider class to the other thing(s)
# some entries have been moved to multiple other entries and use the joining raw word "and"
# 612479 is movedto: "603075 and 603029" OR
# others use a comma-delimited list, like:
# 610402 is movedto: "609122,300870"
if e['entry']['status'] == 'moved':
if re.search('and', str(e['entry']['movedTo'])):
# split the movedTo entry on 'and'
newids = re.split('and', str(e['entry']['movedTo']))
elif len(str(e['entry']['movedTo']).split(',')) > 0:
# split on the comma
newids = str(e['entry']['movedTo']).split(',')
else:
# make a list of one
newids = [str(e['entry']['movedTo'])]
# cleanup whitespace and add OMIM prefix to numeric portion
fixedids = []
for i in newids:
fixedids.append('OMIM:'+i.strip())
gu.addDeprecatedClass(g, omimid, fixedids)
self._get_phenotypicseries_parents(e['entry'], g)
self._get_mappedids(e['entry'], g)
self._get_pubs(e['entry'], g)
self._get_process_allelic_variants(e['entry'], g)
### end iterating over batch of entries
# can't have more than 4 req per sec,
# so wait the remaining time, if necessary
dt = datetime.now() - request_time
rem = 0.25 - dt.total_seconds()
if rem > 0:
logger.info("waiting %d sec", rem)
time.sleep(rem/1000)
gu.loadAllProperties(g)
return
示例7: process_catalog
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadAllProperties [as 别名]
def process_catalog(self, limit=None):
"""
:param limit:
:return:
"""
raw = '/'.join((self.rawdir, self.files['catalog']['file']))
logger.info("Processing Data from %s", raw)
gu = GraphUtils(curie_map.get())
if self.testMode: # set the graph to build
g = self.testgraph
else:
g = self.graph
line_counter = 0
geno = Genotype(g)
gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
gu.loadAllProperties(g)
tax_id = 'NCBITaxon:9606' # hardcode
genome_version = 'GRCh38' # hardcode
# build a hashmap of genomic location to identifiers,
# to try to get the equivalences
loc_to_id_hash = {}
with open(raw, 'r', encoding="iso-8859-1") as csvfile:
filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
next(filereader, None) # skip the header row
for row in filereader:
if not row:
pass
else:
line_counter += 1
(date_added_to_catalog, pubmed_num, first_author, pub_date,
journal, link, study_name, disease_or_trait,
initial_sample_description, replicate_sample_description,
region, chrom_num, chrom_pos, reported_gene_nums,
mapped_gene, upstream_gene_num, downstream_gene_num,
snp_gene_nums, upstream_gene_distance,
downstream_gene_distance, strongest_snp_risk_allele, snps,
merged, snp_id_current, context, intergenic_flag,
risk_allele_frequency, pvalue, pvalue_mlog, pvalue_text,
or_or_beta, confidence_interval_95,
platform_with_snps_passing_qc, cnv_flag, mapped_trait,
mapped_trait_uri) = row
intersect = \
list(set([str(i) for i in self.test_ids['gene']]) &
set(re.split(r',', snp_gene_nums)))
# skip if no matches found in test set
if self.testMode and len(intersect) == 0:
continue
# 06-May-2015 25917933 Zai CC 20-Nov-2014 J Psychiatr Res http://europepmc.org/abstract/MED/25917933
# A genome-wide association study of suicide severity scores in bipolar disorder.
# Suicide in bipolar disorder
# 959 European ancestry individuals NA
# 10p11.22 10 32704340 C10orf68, CCDC7, ITGB1 CCDC7
# rs7079041-A rs7079041 0 7079041 intron 0 2E-6 5.698970
if chrom_num != '' and chrom_pos != '':
loc = 'chr'+str(chrom_num)+':'+str(chrom_pos)
if loc not in loc_to_id_hash:
loc_to_id_hash[loc] = set()
else:
loc = None
if re.search(r' x ', strongest_snp_risk_allele) \
or re.search(r',', strongest_snp_risk_allele):
# TODO deal with haplotypes
logger.warning(
"We can't deal with haplotypes yet: %s",
strongest_snp_risk_allele)
continue
elif re.match(r'rs', strongest_snp_risk_allele):
rs_id = 'dbSNP:'+strongest_snp_risk_allele.strip()
# remove the alteration
elif re.match(r'kgp', strongest_snp_risk_allele):
# FIXME this isn't correct
rs_id = 'dbSNP:'+strongest_snp_risk_allele.strip()
# http://www.1000genomes.org/faq/what-are-kgp-identifiers
# for some information
# They were created by Illumina for their genotyping
# platform before some variants identified during the
# pilot phase of the project had been assigned
# rs numbers.
elif re.match(r'chr', strongest_snp_risk_allele):
# like: chr10:106180121-G
rs_id = ':gwas-' + \
re.sub(
r':', '-', strongest_snp_risk_allele.strip())
elif strongest_snp_risk_allele.strip() == '':
# logger.debug(
# "No strongest SNP risk allele for %s:\n%s",
# pubmed_num, str(row))
# FIXME still consider adding in the EFO terms
# for what the study measured?
#.........这里部分代码省略.........
示例8: _process_diseasegene
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadAllProperties [as 别名]
#.........这里部分代码省略.........
# get the element name and id
# id = elem.get('id') # some internal identifier
disorder_num = elem.find("OrphaNumber").text
disorder_id = "Orphanet:" + str(disorder_num)
if self.testMode and disorder_id not in config.get_config()["test_ids"]["disease"]:
continue
disorder_label = elem.find("Name").text
# make a hash of internal gene id to type for later lookup
gene_iid_to_type = {}
gene_list = elem.find("GeneList")
for gene in gene_list.findall("Gene"):
gene_iid = gene.get("id")
gene_type = gene.find("GeneType").get("id")
gene_iid_to_type[gene_iid] = gene_type
gu.addClassToGraph(g, disorder_id, disorder_label) # assuming that these are in the ontology
assoc_list = elem.find("DisorderGeneAssociationList")
for a in assoc_list.findall("DisorderGeneAssociation"):
gene_iid = a.find(".//Gene").get("id")
gene_name = a.find(".//Gene/Name").text
gene_symbol = a.find(".//Gene/Symbol").text
gene_num = a.find("./Gene/OrphaNumber").text
gene_id = "Orphanet:" + str(gene_num)
gene_type_id = self._map_gene_type_id(gene_iid_to_type[gene_iid])
gu.addClassToGraph(g, gene_id, gene_symbol, gene_type_id, gene_name)
syn_list = a.find("./Gene/SynonymList")
if int(syn_list.get("count")) > 0:
for s in syn_list.findall("./Synonym"):
gu.addSynonym(g, gene_id, s.text)
dgtype = a.find("DisorderGeneAssociationType").get("id")
rel_id = self._map_rel_id(dgtype)
dg_label = a.find("./DisorderGeneAssociationType/Name").text
if rel_id is None:
logger.warn(
"Cannot map association type (%s) to RO for association (%s | %s). Skipping.",
dg_label,
disorder_label,
gene_symbol,
)
continue
alt_locus_id = "_" + gene_num + "-" + disorder_num + "VL"
alt_label = " ".join(
("some variant of", gene_symbol.strip(), "that is a", dg_label.lower(), disorder_label)
)
if self.nobnodes:
alt_locus_id = ":" + alt_locus_id
gu.addIndividualToGraph(g, alt_locus_id, alt_label, geno.genoparts["variant_locus"])
geno.addAlleleOfGene(alt_locus_id, gene_id)
# consider typing the gain/loss-of-function variants like:
# http://sequenceontology.org/browser/current_svn/term/SO:0002054
# http://sequenceontology.org/browser/current_svn/term/SO:0002053
# use "assessed" status to issue an evidence code
# FIXME I think that these codes are sub-optimal
status_code = a.find("DisorderGeneAssociationStatus").get("id")
eco_id = "ECO:0000323" # imported automatically asserted information used in automatic assertion
if status_code == "17991": # Assessed # TODO are these internal ids stable between releases?
eco_id = "ECO:0000322" # imported manually asserted information used in automatic assertion
# Non-traceable author statement ECO_0000034
# imported information in automatic assertion ECO_0000313
assoc = G2PAssoc(self.name, alt_locus_id, disorder_id, rel_id)
assoc.add_evidence(eco_id)
assoc.add_association_to_graph(g)
rlist = a.find("./Gene/ExternalReferenceList")
eqid = None
for r in rlist.findall("ExternalReference"):
if r.find("Source").text == "Ensembl":
eqid = "ENSEMBL:" + r.find("Reference").text
elif r.find("Source").text == "HGNC":
eqid = "HGNC:" + r.find("Reference").text
elif r.find("Source").text == "OMIM":
eqid = "OMIM:" + r.find("Reference").text
else:
pass # skip the others for now
if eqid is not None:
gu.addClassToGraph(g, eqid, None)
gu.addEquivalentClass(g, gene_id, eqid)
pass
elem.clear() # discard the element
if self.testMode and limit is not None and line_counter > limit:
return
gu.loadProperties(g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
gu.loadAllProperties(g)
return
示例9: _get_gene_info
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadAllProperties [as 别名]
#.........这里部分代码省略.........
# TODO might have to figure out if things aren't genes, and make them individuals
gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)
# we have to do special things here for genes, because they're classes not individuals
# f = Feature(gene_id,label,gene_type_id,desc)
if name != '-':
gu.addSynonym(g, gene_id, name)
if synonyms.strip() != '-':
for s in synonyms.split('|'):
gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])
if other_designations.strip() != '-':
for s in other_designations.split('|'):
gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])
# deal with the xrefs
# MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
if xrefs.strip() != '-':
for r in xrefs.strip().split('|'):
fixedr = self._cleanup_id(r)
if fixedr is not None and fixedr.strip() != '':
if re.match('HPRD', fixedr):
# proteins are not == genes.
gu.addTriple(g, gene_id, self.properties['has_gene_product'], fixedr)
else:
# skip some of these for now
if fixedr.split(':')[0] not in ['Vega', 'IMGT/GENE-DB']:
gu.addEquivalentClass(g, gene_id, fixedr)
# edge cases of id | symbol | chr | map_loc:
# 263 AMD1P2 X|Y with Xq28 and Yq12
# 438 ASMT X|Y with Xp22.3 or Yp11.3 # in PAR
# 419 ART3 4 with 4q21.1|4p15.1-p14 # no idea why there's two bands listed - possibly 2 assemblies
# 28227 PPP2R3B X|Y Xp22.33; Yp11.3 # in PAR
# 619538 OMS 10|19|3 10q26.3;19q13.42-q13.43;3p25.3 #this is of "unknown" type == susceptibility
# 101928066 LOC101928066 1|Un - # unlocated scaffold
# 11435 Chrna1 2 2 C3|2 43.76 cM # mouse --> 2C3
# 11548 Adra1b 11 11 B1.1|11 25.81 cM # mouse --> 11B1.1
# 11717 Ampd3 7 7 57.85 cM|7 E2-E3 # mouse
# 14421 B4galnt1 10 10 D3|10 74.5 cM # mouse
# 323212 wu:fb92e12 19|20 - # fish
# 323368 ints10 6|18 - # fish
# 323666 wu:fc06e02 11|23 - # fish
# feel that the chr placement can't be trusted in this table when there is > 1 listed
# with the exception of human X|Y, i will only take those that align to one chr
# FIXME remove the chr mapping below when we pull in the genomic coords
if str(chr) != '-' and str(chr) != '':
if re.search('\|', str(chr)) and str(chr) not in ['X|Y','X; Y']:
# this means that there's uncertainty in the mapping. skip it
# TODO we'll need to figure out how to deal with >1 loc mapping
logger.info('%s is non-uniquely mapped to %s. Skipping for now.', gene_id, str(chr))
continue
# X|Y Xp22.33;Yp11.3
# if (not re.match('(\d+|(MT)|[XY]|(Un)$',str(chr).strip())):
# print('odd chr=',str(chr))
if str(chr) == 'X; Y':
chr = 'X|Y' # rewrite the PAR regions for processing
# do this in a loop to allow PAR regions like X|Y
for c in re.split('\|',str(chr)) :
geno.addChromosomeClass(c, tax_id, None) # assume that the chromosome label will get added elsewhere
mychrom = makeChromID(c, tax_num, 'CHR')
mychrom_syn = makeChromLabel(c, tax_num) # temporarily use the taxnum for the disambiguating label
gu.addSynonym(g, mychrom, mychrom_syn)
band_match = re.match('[0-9A-Z]+[pq](\d+)?(\.\d+)?$', map_loc)
if band_match is not None and len(band_match.groups()) > 0:
# if tax_num != '9606':
# continue
# this matches the regular kind of chrs, so make that kind of band
# not sure why this matches? chrX|Y or 10090chr12|Un"
# TODO we probably need a different regex per organism
# the maploc_id already has the numeric chromosome in it, strip it first
bid = re.sub('^'+c, '', map_loc)
maploc_id = makeChromID(c+bid, tax_num, 'CHR') # the generic location (no coordinates)
# print(map_loc,'-->',bid,'-->',maploc_id)
band = Feature(maploc_id, None, None) # Assume it's type will be added elsewhere
band.addFeatureToGraph(g)
# add the band as the containing feature
gu.addTriple(g, gene_id, Feature.object_properties['is_subsequence_of'], maploc_id)
else:
# TODO handle these cases
# examples are: 15q11-q22, Xp21.2-p11.23, 15q22-qter, 10q11.1-q24,
## 12p13.3-p13.2|12p13-p12, 1p13.3|1p21.3-p13.1, 12cen-q21, 22q13.3|22q13.3
logger.debug('not regular band pattern for %s: %s', gene_id, map_loc)
# add the gene as a subsequence of the chromosome
gu.addTriple(g, gene_id, Feature.object_properties['is_subsequence_of'], mychrom)
geno.addTaxon(tax_id, gene_id)
if not self.testMode and limit is not None and line_counter > limit:
break
gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
gu.loadAllProperties(g)
return
示例10: _process_data
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadAllProperties [as 别名]
def _process_data(self, raw, limit=None):
"""
This function will process the data files from Coriell.
We make the assumption that any alleles listed are variants
(alternates to w.t.)
Triples: (examples)
:NIGMSrepository a CLO_0000008 #repository
label : NIGMS Human Genetic Cell Repository
foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8
line_id a CL_0000057, #fibroblast line
derives_from patient_id
part_of :NIGMSrepository
RO:model_of OMIM:disease_id
patient id a foaf:person,
label: "fibroblast from patient 12345 with disease X"
member_of family_id #what is the right thing here?
SIO:race EFO:caucasian #subclass of EFO:0001799
in_taxon NCBITaxon:9606
dc:description Literal(remark)
RO:has_phenotype OMIM:disease_id
GENO:has_genotype genotype_id
family_id a owl:NamedIndividual
foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"
genotype_id a intrinsic_genotype
GENO:has_alternate_part allelic_variant_id
we don't necessarily know much about the genotype,
other than the allelic variant. also there's the sex here
pub_id mentions cell_line_id
:param raw:
:param limit:
:return:
"""
logger.info("Processing Data from %s", raw)
gu = GraphUtils(curie_map.get())
if self.testMode: # set the graph to build
g = self.testgraph
else:
g = self.graph
line_counter = 0
geno = Genotype(g)
du = DipperUtil()
gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
gu.loadAllProperties(g)
with open(raw, 'r', encoding="iso-8859-1") as csvfile:
filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
next(filereader, None) # skip the header row
for row in filereader:
if not row:
pass
else:
line_counter += 1
(catalog_id, description, omim_number, sample_type,
cell_line_available, dna_in_stock, dna_ref, gender, age,
race, ethnicity, affected, karyotype, relprob, mutation,
gene, family_id, collection, url, cat_remark, pubmed_ids,
family_member, variant_id, dbsnp_id, species) = row
# example:
# GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
# parent,,,39,NIGMS Human Genetic Cell Repository,
# http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
# 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
# 2,,18343,Homo sapiens
if self.testMode and catalog_id not in self.test_lines:
# skip rows not in our test lines, when in test mode
continue
# ########### BUILD REQUIRED VARIABLES ###########
# Make the cell line ID
cell_line_id = 'Coriell:'+catalog_id.strip()
# Map the cell/sample type
cell_type = self._map_cell_type(sample_type)
# Make a cell line label
line_label = \
collection.partition(' ')[0]+'-'+catalog_id.strip()
# Map the repository/collection
repository = self._map_collection(collection)
# patients are uniquely identified by one of:
# dbsnp id (which is == an individual haplotype)
# family id + family member (if present) OR
# probands are usually family member zero
#.........这里部分代码省略.........
示例11: Monochrom
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadAllProperties [as 别名]
#.........这里部分代码省略.........
# then the subbands are subsequences of the full band
# add the subsequence stuff as restrictions
if i < len(parents) - 1:
pid = cclassid+parents[i+1] # the instance
self.gu.addOWLPropertyClassRestriction(
self.graph, pclassid,
Feature.object_properties['is_subsequence_of'],
pid)
self.gu.addOWLPropertyClassRestriction(
self.graph, pid,
Feature.object_properties['has_subsequence'],
pclassid)
else:
# add the last one (p or q usually)
# as attached to the chromosome
self.gu.addOWLPropertyClassRestriction(
self.graph, pclassid,
Feature.object_properties['is_subsequence_of'],
cclassid)
self.gu.addOWLPropertyClassRestriction(
self.graph, cclassid,
Feature.object_properties['has_subsequence'],
pclassid)
# connect the band here to the first one in the parent list
if len(parents) > 0:
self.gu.addOWLPropertyClassRestriction(
self.graph, maplocclass_id,
Feature.object_properties['is_subsequence_of'],
cclassid+parents[0])
self.gu.addOWLPropertyClassRestriction(
self.graph, cclassid+parents[0],
Feature.object_properties['has_subsequence'],
maplocclass_id)
if limit is not None and line_counter > limit:
break
self.gu.loadAllProperties(self.graph)
# TODO figure out the staining intensities for the encompassing bands
return
def make_parent_bands(self, band, child_bands):
    """
    Recursively collect the grouping (parent) bands a band belongs to.

    Each step drops the last character of ``band``, plus a trailing '.'
    if that leaves one, and records the result, e.g.
    q21.31 ==> q21.3, q21, q2, q
    Only bands that start with p or q followed by at least one of A-H
    or a digit are expanded.  Parents are added to ``child_bands`` in
    place.

    :param band: the band string to expand
    :param child_bands: set that receives the parent bands
    :return: ``child_bands``, or a new empty set when ``band`` is empty
    """
    if len(band) == 0:
        return set()

    if re.match(r'([pq][A-H\d]+(?:\.\d+)?)', band):
        parent = re.sub(r'\.$', '', str(band[:-1]))
        child_bands.add(parent)
        self.make_parent_bands(parent, child_bands)

    return child_bands
def map_type_of_region(self, regiontype):
    """
    Translate a region-type code into its sequence-ontology id.

    Note that "stalk" refers to the short arm of acrocentric chromosomes
    chr13,14,15,21,22 for human.

    :param regiontype: code to look up in ``self.region_type_map``
    :return: the mapped id, or the generic chromosome-part id (with a
        warning logged) when the code is not in the map
    """
    fallback_id = Feature.types['chromosome_part']
    if regiontype not in self.region_type_map:
        logger.warning(
            "Unmapped code %s. Defaulting to chr_part 'SO:0000830'.",
            regiontype)
        return fallback_id
    return self.region_type_map.get(regiontype)
def _check_tax_ids(self):
    """
    Verify that every taxon in ``self.tax_ids`` has an entry in
    ``self.files`` (keyed by the taxon written as a string).

    :raises Exception: for the first taxon that has no entry
    :return: None
    """
    for taxon in self.tax_ids:
        taxon_key = str(taxon)
        if taxon_key in self.files:
            continue
        raise Exception(
            "Taxon " + taxon_key + " not supported by source Monochrom")
def getTestSuite(self):
    """
    Return the unit-test suite for this source.

    The loader call is currently commented out, so this always
    returns None.

    :return: None
    """
    # Disabled for now:
    #   import unittest
    #   from tests.test_ucscbands import UCSCBandsTestCase
    #   test_suite = \
    #       unittest.TestLoader().loadTestsFromTestCase(UCSCBandsTestCase)
    return None
示例12: _process_phenotype_data
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadAllProperties [as 别名]
#.........这里部分代码省略.........
geno.genoparts['variant_locus'])
vl_set.add(vl_id)
if len(variants) == 1 and len(genes) == 1:
for gene in genes:
geno.addAlleleOfGene(vl_id, gene)
else:
geno.addAllele(vl_id, vl_symbol)
else: # len(vars) == 0
# it's just anonymous variants in some gene
for gene in genes:
vl_id = '_'+gene+'-VL'
vl_id = re.sub(r':', '', vl_id)
if self.nobnodes:
vl_id = ':'+vl_id
vl_symbol = self.id_label_hash[gene]+'<?>'
self.id_label_hash[vl_id] = vl_symbol
geno.addAllele(vl_id, vl_symbol,
geno.genoparts['variant_locus'])
geno.addGene(gene, self.id_label_hash[gene])
geno.addAlleleOfGene(vl_id, gene)
vl_set.add(vl_id)
# make the vslcs
vl_list = sorted(vl_set)
vslc_list = []
for vl in vl_list:
# for unknown zygosity
vslc_id = '_'+re.sub(r'^_', '', vl)+'U'
vslc_id = re.sub(r':', '', vslc_id)
if self.nobnodes:
vslc_id = ':' + vslc_id
vslc_label = self.id_label_hash[vl] + '/?'
self.id_label_hash[vslc_id] = vslc_label
vslc_list.append(vslc_id)
geno.addPartsToVSLC(
vslc_id, vl, None, geno.zygosity['indeterminate'],
geno.object_properties['has_alternate_part'], None)
gu.addIndividualToGraph(
g, vslc_id, vslc_label,
geno.genoparts['variant_single_locus_complement'])
if len(vslc_list) > 0:
if len(vslc_list) > 1:
gvc_id = '-'.join(vslc_list)
gvc_id = re.sub(r':', '', gvc_id)
if self.nobnodes:
gvc_id = ':'+gvc_id
gvc_label = \
'; '.join(self.id_label_hash[v] for v in vslc_list)
gu.addIndividualToGraph(
g, gvc_id, gvc_label,
geno.genoparts['genomic_variation_complement'])
for vslc_id in vslc_list:
geno.addVSLCtoParent(vslc_id, gvc_id)
else:
# the GVC == VSLC, so don't have to make an extra piece
gvc_id = vslc_list.pop()
gvc_label = self.id_label_hash[gvc_id]
genotype_label = gvc_label + ' [n.s.]'
bkgd_id = \
'_' + re.sub(r':', '', '-'.join(
(geno.genoparts['unspecified_genomic_background'],
s)))
genotype_id = '-'.join((gvc_id, bkgd_id))
if self.nobnodes:
bkgd_id = ':'+bkgd_id
geno.addTaxon(mouse_taxon, bkgd_id)
geno.addGenomicBackground(
bkgd_id, 'unspecified ('+s+')',
geno.genoparts['unspecified_genomic_background'],
"A placeholder for the " +
"unspecified genetic background for "+s)
geno.addGenomicBackgroundToGenotype(
bkgd_id, genotype_id,
geno.genoparts['unspecified_genomic_background'])
geno.addParts(
gvc_id, genotype_id,
geno.object_properties['has_alternate_part'])
geno.addGenotype(genotype_id, genotype_label)
gu.addTriple(
g, s, geno.object_properties['has_genotype'],
genotype_id)
else:
# logger.debug(
# "Strain %s is not making a proper genotype.", s)
pass
gu.loadProperties(
g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
gu.loadProperties(
g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
gu.loadProperties(
g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
gu.loadAllProperties(g)
logger.warning(
"The following gene symbols did not list identifiers: %s",
str(sorted(list(genes_with_no_ids))))
return
示例13: _process_kegg_disease2gene
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadAllProperties [as 别名]
def _process_kegg_disease2gene(self, limit=None):
    """
    Link KEGG diseases to their genes through placeholder variant loci.

    Reads the tab-delimited 'disease_gene' file found under self.rawdir
    as (gene_id, disease_id) pairs and prefixes both ids with 'KEGG-'.
    Only diseases that are absent from self.kegg_disease_hash (reported
    in the log as having no OMIM id) and whose label does not mark a
    grouping class are processed.

    For every retained pair:
        the disease is added as a class with 'DOID:4' as its type
        a variant-locus individual is added; its id comes from
            self._make_variant_locus_id and its label from
            self.label_hash
        the variant locus is declared an allele of the gene
        a G2PAssoc built from the variant locus, the disease and the
            'is_marker_for' relationship is added to the graph

    :param limit: when not None (and not in test mode), reading stops
        once the number of rows read exceeds this value
    :return: None
    """
    logger.info("Processing KEGG disease to gene")

    graph = self.testgraph if self.testMode else self.graph
    genotype = Genotype(graph)
    graph_utils = GraphUtils(curie_map.get())
    marker_rel = graph_utils.object_properties['is_marker_for']
    graph_utils.loadAllProperties(graph)

    diseases_without_omim = set()
    raw_path = '/'.join((self.rawdir, self.files['disease_gene']['file']))

    with open(raw_path, 'r', encoding="iso-8859-1") as tsv_handle:
        reader = csv.reader(tsv_handle, delimiter='\t', quotechar='\"')
        for row_number, row in enumerate(reader, start=1):
            raw_gene, raw_disease = row

            # The test-id filter is applied to the id exactly as it
            # appears in the file, before the 'KEGG-' prefix is added.
            if self.testMode and raw_gene not in self.test_ids['genes']:
                continue

            gene_id = 'KEGG-' + raw_gene.strip()
            disease_id = 'KEGG-' + raw_disease.strip()

            # Diseases present in kegg_disease_hash are left alone here.
            if disease_id not in self.kegg_disease_hash:
                disease_label = (
                    self.label_hash[disease_id]
                    if disease_id in self.label_hash else None)

                # Labels containing 'includ' ("including ...") denote a
                # grouping class rather than a single disease.
                if re.search('includ', str(disease_label)):
                    logger.info("Skipping this association because it's a grouping class: %s", disease_label)
                    continue

                # Type this disease_id as a disease.
                graph_utils.addClassToGraph(
                    graph, disease_id, disease_label, 'DOID:4')
                diseases_without_omim.add(disease_id)

                variant_locus_id = self._make_variant_locus_id(
                    gene_id, disease_id)
                variant_locus_label = self.label_hash[variant_locus_id]
                graph_utils.addIndividualToGraph(
                    graph, variant_locus_id, variant_locus_label,
                    genotype.genoparts['variant_locus'])
                genotype.addAlleleOfGene(variant_locus_id, gene_id)

                # Record the disease-to-gene relationship itself.
                association = G2PAssoc(
                    self.name, variant_locus_id, disease_id, marker_rel)
                association.load_all_properties(graph)
                association.add_association_to_graph(graph)

            # The row limit is only honoured outside of test mode.
            if not self.testMode and limit is not None \
                    and row_number > limit:
                break

    logger.info("Done with KEGG disease to gene")
    logger.info(
        "Found %d diseases with no omim id", len(diseases_without_omim))
示例14: _process_diseasegene
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadAllProperties [as 别名]
#.........这里部分代码省略.........
gene_iid_to_type = {}
gene_list = elem.find('GeneList')
for gene in gene_list.findall('Gene'):
gene_iid = gene.get('id')
gene_type = gene.find('GeneType').get('id')
gene_iid_to_type[gene_iid] = gene_type
# assuming that these are in the ontology
gu.addClassToGraph(g, disorder_id, disorder_label)
assoc_list = elem.find('DisorderGeneAssociationList')
for a in assoc_list.findall('DisorderGeneAssociation'):
gene_iid = a.find('.//Gene').get('id')
gene_name = a.find('.//Gene/Name').text
gene_symbol = a.find('.//Gene/Symbol').text
gene_num = a.find('./Gene/OrphaNumber').text
gene_id = 'Orphanet:'+str(gene_num)
gene_type_id = \
self._map_gene_type_id(gene_iid_to_type[gene_iid])
gu.addClassToGraph(
g, gene_id, gene_symbol, gene_type_id, gene_name)
syn_list = a.find('./Gene/SynonymList')
if int(syn_list.get('count')) > 0:
for s in syn_list.findall('./Synonym'):
gu.addSynonym(g, gene_id, s.text)
dgtype = a.find('DisorderGeneAssociationType').get('id')
rel_id = self._map_rel_id(dgtype)
dg_label = \
a.find('./DisorderGeneAssociationType/Name').text
if rel_id is None:
logger.warning(
"Cannot map association type (%s) to RO " +
"for association (%s | %s). Skipping.",
dg_label, disorder_label, gene_symbol)
continue
alt_locus_id = '_'+gene_num+'-'+disorder_num+'VL'
alt_label = \
' '.join(('some variant of', gene_symbol.strip(),
'that is a', dg_label.lower(),
disorder_label))
if self.nobnodes:
alt_locus_id = ':'+alt_locus_id
gu.addIndividualToGraph(g, alt_locus_id, alt_label,
geno.genoparts['variant_locus'])
geno.addAlleleOfGene(alt_locus_id, gene_id)
# consider typing the gain/loss-of-function variants like:
# http://sequenceontology.org/browser/current_svn/term/SO:0002054
# http://sequenceontology.org/browser/current_svn/term/SO:0002053
# use "assessed" status to issue an evidence code
# FIXME I think that these codes are sub-optimal
status_code = \
a.find('DisorderGeneAssociationStatus').get('id')
# imported automatically asserted information
# used in automatic assertion
eco_id = 'ECO:0000323'
# Assessed
# TODO are these internal ids stable between releases?
if status_code == '17991':
# imported manually asserted information
# used in automatic assertion
eco_id = 'ECO:0000322'
# Non-traceable author statement ECO_0000034
# imported information in automatic assertion ECO_0000313
assoc = G2PAssoc(self.name, alt_locus_id,
disorder_id, rel_id)
assoc.add_evidence(eco_id)
assoc.add_association_to_graph(g)
rlist = a.find('./Gene/ExternalReferenceList')
eqid = None
for r in rlist.findall('ExternalReference'):
if r.find('Source').text == 'Ensembl':
eqid = 'ENSEMBL:'+r.find('Reference').text
elif r.find('Source').text == 'HGNC':
eqid = 'HGNC:'+r.find('Reference').text
elif r.find('Source').text == 'OMIM':
eqid = 'OMIM:'+r.find('Reference').text
else:
pass # skip the others for now
if eqid is not None:
gu.addClassToGraph(g, eqid, None)
gu.addEquivalentClass(g, gene_id, eqid)
elem.clear() # discard the element
if self.testMode and limit is not None and line_counter > limit:
return
gu.loadProperties(
g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
gu.loadAllProperties(g)
return
示例15: _process_data
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadAllProperties [as 别名]
def _process_data(self, raw, limit=None):
logger.info("Processing Data from %s", raw)
gu = GraphUtils(curie_map.get())
if self.testMode:
g = self.testgraph
else:
g = self.graph
geno = Genotype(g)
line_counter = 0
gu.loadAllProperties(g)
gu.loadObjectProperties(g, geno.object_properties)
# Add the taxon as a class
taxon_id = 'NCBITaxon:10090' # map to Mus musculus
gu.addClassToGraph(g, taxon_id, None)
# with open(raw, 'r', encoding="utf8") as csvfile:
with gzip.open(raw, 'rt') as csvfile:
filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
next(filereader, None) # skip the header row
for row in filereader:
line_counter += 1
(marker_accession_id, marker_symbol, phenotyping_center,
colony, sex, zygosity, allele_accession_id, allele_symbol,
allele_name, strain_accession_id, strain_name, project_name,
project_fullname, pipeline_name, pipeline_stable_id,
procedure_stable_id, procedure_name, parameter_stable_id,
parameter_name, top_level_mp_term_id, top_level_mp_term_name,
mp_term_id, mp_term_name, p_value, percentage_change,
effect_size, statistical_method, resource_name) = row
if self.testMode and marker_accession_id not in self.test_ids:
continue
# ##### cleanup some of the identifiers ######
zygosity_id = self._map_zygosity(zygosity)
# colony ids sometimes have <> in them, spaces,
# or other non-alphanumerics and break our system;
# replace these with underscores
colony_id = '_'+re.sub(r'\W+', '_', colony)
if self.nobnodes:
colony_id = ':'+colony_id
if not re.match(r'MGI', allele_accession_id):
allele_accession_id = \
'_IMPC-'+re.sub(r':', '', allele_accession_id)
if self.nobnodes:
allele_accession_id = ':'+allele_accession_id
if re.search(r'EUROCURATE', strain_accession_id):
# the eurocurate links don't resolve at IMPC
strain_accession_id = '_'+strain_accession_id
if self.nobnodes:
strain_accession_id = ':'+strain_accession_id
elif not re.match(r'MGI', strain_accession_id):
logger.info(
"Found a strange strain accession...%s",
strain_accession_id)
strain_accession_id = 'IMPC:'+strain_accession_id
######################
# first, add the marker and variant to the graph as with MGI,
# the allele is the variant locus. IF the marker is not known,
# we will call it a sequence alteration. otherwise,
# we will create a BNode for the sequence alteration.
sequence_alteration_id = variant_locus_id = None
variant_locus_name = sequence_alteration_name = None
# extract out what's within the <> to get the symbol
if re.match(r'.*<.*>', allele_symbol):
sequence_alteration_name = \
re.match(r'.*<(.*)>', allele_symbol).group(1)
else:
sequence_alteration_name = allele_symbol
if marker_accession_id is not None and \
marker_accession_id == '':
logger.warning(
"Marker unspecified on row %d", line_counter)
marker_accession_id = None
if marker_accession_id is not None:
variant_locus_id = allele_accession_id
variant_locus_name = allele_symbol
variant_locus_type = geno.genoparts['variant_locus']
geno.addGene(marker_accession_id, marker_symbol,
geno.genoparts['gene'])
geno.addAllele(variant_locus_id, variant_locus_name,
variant_locus_type, None)
geno.addAlleleOfGene(variant_locus_id, marker_accession_id)
sequence_alteration_id = \
'_seqalt'+re.sub(r':', '', allele_accession_id)
if self.nobnodes:
sequence_alteration_id = ':'+sequence_alteration_id
geno.addSequenceAlterationToVariantLocus(
sequence_alteration_id, variant_locus_id)
#.........这里部分代码省略.........