本文整理汇总了Python中dipper.utils.GraphUtils.GraphUtils.loadProperties方法的典型用法代码示例。如果您正苦于以下问题:Python GraphUtils.loadProperties方法的具体用法?Python GraphUtils.loadProperties怎么用?Python GraphUtils.loadProperties使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类dipper.utils.GraphUtils.GraphUtils
的用法示例。
在下文中一共展示了GraphUtils.loadProperties方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _process_genes
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadProperties [as 别名]
def _process_genes(self, taxid, limit=None):
gu = GraphUtils(curie_map.get())
if self.testMode:
g = self.testgraph
else:
g = self.graph
geno = Genotype(g)
raw = '/'.join((self.rawdir, self.files[taxid]['file']))
line_counter = 0
logger.info("Processing Ensembl genes for tax %s", taxid)
with open(raw, 'r', encoding="utf8") as csvfile:
filereader = csv.reader(csvfile, delimiter='\t')
for row in filereader:
if len(row) < 4:
logger.error("Data error for file %s", raw)
return
(ensembl_gene_id, external_gene_name, description,
gene_biotype, entrezgene) = row[0:5]
# in the case of human genes, we also get the hgnc id,
# and is the last col
if taxid == '9606':
hgnc_id = row[5]
else:
hgnc_id = None
if self.testMode and entrezgene != '' \
and int(entrezgene) not in self.gene_ids:
continue
line_counter += 1
gene_id = 'ENSEMBL:'+ensembl_gene_id
if description == '':
description = None
gene_type_id = self._get_gene_type(gene_biotype)
gene_type_id = None
gu.addClassToGraph(
g, gene_id, external_gene_name, gene_type_id, description)
if entrezgene != '':
gu.addEquivalentClass(g, gene_id, 'NCBIGene:'+entrezgene)
if hgnc_id is not None and hgnc_id != '':
gu.addEquivalentClass(g, gene_id, hgnc_id)
geno.addTaxon('NCBITaxon:'+taxid, gene_id)
if not self.testMode \
and limit is not None and line_counter > limit:
break
gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
gu.loadAllProperties(g)
return
示例2: parse
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadProperties [as 别名]
def parse(self, limit=None):
"""
MPD data is delivered in four separate csv files and one xml file,
which we process iteratively and write out as
one large graph.
:param limit:
:return:
"""
if limit is not None:
logger.info("Only parsing first %s rows fo each file", str(limit))
logger.info("Parsing files...")
if self.testOnly:
self.testMode = True
g = self.testgraph
self.geno = Genotype(self.testgraph)
else:
g = self.graph
self._process_straininfo(limit)
# the following will provide us the hash-lookups
# These must be processed in a specific order
# mapping between assays and ontology terms
self._process_ontology_mappings_file(limit)
# this is the metadata about the measurements
self._process_measurements_file(limit)
# get all the measurements per strain
self._process_strainmeans_file(limit)
# The following will use the hash populated above
# to lookup the ids when filling in the graph
self._fill_provenance_graph(limit)
logger.info("Finished parsing.")
self.load_bindings()
gu = GraphUtils(curie_map.get())
gu.loadAllProperties(g)
gu.loadProperties(g, G2PAssoc.object_properties, GraphUtils.OBJPROP)
gu.loadProperties(g, G2PAssoc.datatype_properties, GraphUtils.OBJPROP)
gu.loadProperties(
g, G2PAssoc.annotation_properties, GraphUtils.ANNOTPROP)
logger.info("Found %d nodes", len(self.graph))
return
示例3: _process_phenotype_data
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadProperties [as 别名]
#.........这里部分代码省略.........
geno.genoparts['variant_locus'])
vl_set.add(vl_id)
if len(variants) == 1 and len(genes) == 1:
for gene in genes:
geno.addAlleleOfGene(vl_id, gene)
else:
geno.addAllele(vl_id, vl_symbol)
else: # len(vars) == 0
# it's just anonymous variants in some gene
for gene in genes:
vl_id = '_'+gene+'-VL'
vl_id = re.sub(r':', '', vl_id)
if self.nobnodes:
vl_id = ':'+vl_id
vl_symbol = self.id_label_hash[gene]+'<?>'
self.id_label_hash[vl_id] = vl_symbol
geno.addAllele(vl_id, vl_symbol,
geno.genoparts['variant_locus'])
geno.addGene(gene, self.id_label_hash[gene])
geno.addAlleleOfGene(vl_id, gene)
vl_set.add(vl_id)
# make the vslcs
vl_list = sorted(vl_set)
vslc_list = []
for vl in vl_list:
# for unknown zygosity
vslc_id = '_'+re.sub(r'^_', '', vl)+'U'
vslc_id = re.sub(r':', '', vslc_id)
if self.nobnodes:
vslc_id = ':' + vslc_id
vslc_label = self.id_label_hash[vl] + '/?'
self.id_label_hash[vslc_id] = vslc_label
vslc_list.append(vslc_id)
geno.addPartsToVSLC(
vslc_id, vl, None, geno.zygosity['indeterminate'],
geno.object_properties['has_alternate_part'], None)
gu.addIndividualToGraph(
g, vslc_id, vslc_label,
geno.genoparts['variant_single_locus_complement'])
if len(vslc_list) > 0:
if len(vslc_list) > 1:
gvc_id = '-'.join(vslc_list)
gvc_id = re.sub(r':', '', gvc_id)
if self.nobnodes:
gvc_id = ':'+gvc_id
gvc_label = \
'; '.join(self.id_label_hash[v] for v in vslc_list)
gu.addIndividualToGraph(
g, gvc_id, gvc_label,
geno.genoparts['genomic_variation_complement'])
for vslc_id in vslc_list:
geno.addVSLCtoParent(vslc_id, gvc_id)
else:
# the GVC == VSLC, so don't have to make an extra piece
gvc_id = vslc_list.pop()
gvc_label = self.id_label_hash[gvc_id]
genotype_label = gvc_label + ' [n.s.]'
bkgd_id = \
'_' + re.sub(r':', '', '-'.join(
(geno.genoparts['unspecified_genomic_background'],
s)))
genotype_id = '-'.join((gvc_id, bkgd_id))
if self.nobnodes:
bkgd_id = ':'+bkgd_id
geno.addTaxon(mouse_taxon, bkgd_id)
geno.addGenomicBackground(
bkgd_id, 'unspecified ('+s+')',
geno.genoparts['unspecified_genomic_background'],
"A placeholder for the " +
"unspecified genetic background for "+s)
geno.addGenomicBackgroundToGenotype(
bkgd_id, genotype_id,
geno.genoparts['unspecified_genomic_background'])
geno.addParts(
gvc_id, genotype_id,
geno.object_properties['has_alternate_part'])
geno.addGenotype(genotype_id, genotype_label)
gu.addTriple(
g, s, geno.object_properties['has_genotype'],
genotype_id)
else:
# logger.debug(
# "Strain %s is not making a proper genotype.", s)
pass
gu.loadProperties(
g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
gu.loadProperties(
g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
gu.loadProperties(
g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
gu.loadAllProperties(g)
logger.warning(
"The following gene symbols did not list identifiers: %s",
str(sorted(list(genes_with_no_ids))))
return
示例4: Feature
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadProperties [as 别名]
#.........这里部分代码省略.........
return
def addPositionToGraph(
self, graph, reference_id, position,
position_types=None, strand=None):
"""
Add the positional information to the graph, following the faldo model.
We assume that if the strand is None,
we give it a generic "Position" only.
Triples:
my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
faldo:position Integer(numeric position)
faldo:reference reference_id
:param graph:
:param reference_id:
:param position:
:param position_types:
:param strand:
:return: Identifier of the position created
"""
iid = self._makePositionId(reference_id, position, position_types)
n = self.gu.getNode(iid)
pos = self.gu.getNode(self.properties['position'])
ref = self.gu.getNode(self.properties['reference'])
if position is not None:
graph.add((n, pos, Literal(position, datatype=XSD['integer'])))
graph.add((n, ref, self.gu.getNode(reference_id)))
if position_types is not None:
for t in position_types:
graph.add((n, RDF['type'], self.gu.getNode(t)))
s = None
if strand is not None:
s = strand
if not re.match(r'faldo', strand):
# not already mapped to faldo, so expect we need to map it
s = self._getStrandType(strand)
# else:
# s = self.types['both_strand']
if s is None and (position_types is None or position_types == []):
s = self.types['Position']
if s is not None:
graph.add((n, RDF['type'], self.gu.getNode(s)))
return iid
def addSubsequenceOfFeature(self, graph, parentid):
"""
This will add reciprocal triples like:
feature is_subsequence_of parent
parent has_subsequence feature
:param graph:
:param parentid:
:return:
"""
self.gu.addTriple(
graph, self.id, self.properties['is_subsequence_of'], parentid)
self.gu.addTriple(
graph, parentid, self.properties['has_subsequence'], self.id)
return
def addTaxonToFeature(self, graph, taxonid):
"""
Given the taxon id, this will add the following triple:
feature in_taxon taxonid
:param graph:
:param taxonid:
:return:
"""
# TEC: should taxon be set in __init__()?
self.taxon = taxonid
self.gu.addTriple(
graph, self.id, Assoc.properties['in_taxon'], self.taxon)
return
def loadAllProperties(self, graph):
prop_dict = {
Assoc(None).ANNOTPROP: self.annotation_properties,
Assoc(None).OBJECTPROP: self.object_properties,
Assoc(None).DATAPROP: self.data_properties
}
for p in prop_dict:
self.gu.loadProperties(graph, prop_dict.get(p), p)
return
def addFeatureProperty(self, graph, property_type, property):
self.gu.addTriple(graph, self.id, property_type, property)
return
def setNoBNodes(self, nobnodes):
self.nobnodes = nobnodes
return
示例5: Genotype
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadProperties [as 别名]
#.........这里部分代码省略.........
'feature_to_gene_relation': 'GENO:0000418'
}
annotation_properties = {
# TODO change properties with https://github.com/monarch-initiative/GENO-ontology/issues/21
'reference_nucleotide': 'GENO:reference_nucleotide', # Made up term
'reference_amino_acid': 'GENO:reference_amino_acid', # Made up term
'altered_nucleotide': 'GENO:altered_nucleotide', # Made up term
'results_in_amino_acid_change': 'GENO:results_in_amino_acid_change' # Made up term
}
zygosity = {
'homoplasmic': 'GENO:0000602',
'heterozygous': 'GENO:0000135',
'indeterminate': 'GENO:0000137',
'heteroplasmic': 'GENO:0000603',
'hemizygous-y': 'GENO:0000604',
'hemizygous-x': 'GENO:0000605',
'homozygous': 'GENO:0000136',
'hemizygous': 'GENO:0000606',
'complex_heterozygous': 'GENO:0000402',
'simple_heterozygous': 'GENO:0000458'
}
properties = object_properties.copy()
properties.update(annotation_properties)
def __init__(self, graph):
self.gu = GraphUtils(curie_map.get())
self.graph = graph
self.gu.loadProperties(self.graph, self.object_properties, self.gu.OBJPROP)
return
def addGenotype(self, genotype_id, genotype_label, genotype_type=None, genotype_description=None):
"""
If a genotype_type is not supplied, we will default to 'intrinsic_genotype'
:param genotype_id:
:param genotype_label:
:param genotype_type:
:param genotype_description:
:return:
"""
if genotype_type is None:
genotype_type = self.genoparts['intrinsic_genotype']
self.gu.addIndividualToGraph(self.graph, genotype_id, genotype_label, genotype_type, genotype_description)
return
def addAllele(self, allele_id, allele_label, allele_type=None, allele_description=None):
"""
Make an allele object. If no allele_type is added, it will default to a geno:allele
:param allele_id: curie for allele (required)
:param allele_label: label for allele (required)
:param allele_type: id for an allele type (optional, recommended SO or GENO class)
:param allele_description: a free-text description of the allele
:return:
"""
# TODO should we accept a list of allele types?
if (allele_type is None):
allele_type = self.genoparts['allele'] #TODO is this a good idea?
self.gu.addIndividualToGraph(self.graph, allele_id, allele_label, allele_type, allele_description)
示例6: _process_diseasegene
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadProperties [as 别名]
#.........这里部分代码省略.........
gene_iid_to_type = {}
gene_list = elem.find('GeneList')
for gene in gene_list.findall('Gene'):
gene_iid = gene.get('id')
gene_type = gene.find('GeneType').get('id')
gene_iid_to_type[gene_iid] = gene_type
# assuming that these are in the ontology
gu.addClassToGraph(g, disorder_id, disorder_label)
assoc_list = elem.find('DisorderGeneAssociationList')
for a in assoc_list.findall('DisorderGeneAssociation'):
gene_iid = a.find('.//Gene').get('id')
gene_name = a.find('.//Gene/Name').text
gene_symbol = a.find('.//Gene/Symbol').text
gene_num = a.find('./Gene/OrphaNumber').text
gene_id = 'Orphanet:'+str(gene_num)
gene_type_id = \
self._map_gene_type_id(gene_iid_to_type[gene_iid])
gu.addClassToGraph(
g, gene_id, gene_symbol, gene_type_id, gene_name)
syn_list = a.find('./Gene/SynonymList')
if int(syn_list.get('count')) > 0:
for s in syn_list.findall('./Synonym'):
gu.addSynonym(g, gene_id, s.text)
dgtype = a.find('DisorderGeneAssociationType').get('id')
rel_id = self._map_rel_id(dgtype)
dg_label = \
a.find('./DisorderGeneAssociationType/Name').text
if rel_id is None:
logger.warning(
"Cannot map association type (%s) to RO " +
"for association (%s | %s). Skipping.",
dg_label, disorder_label, gene_symbol)
continue
alt_locus_id = '_'+gene_num+'-'+disorder_num+'VL'
alt_label = \
' '.join(('some variant of', gene_symbol.strip(),
'that is a', dg_label.lower(),
disorder_label))
if self.nobnodes:
alt_locus_id = ':'+alt_locus_id
gu.addIndividualToGraph(g, alt_locus_id, alt_label,
geno.genoparts['variant_locus'])
geno.addAlleleOfGene(alt_locus_id, gene_id)
# consider typing the gain/loss-of-function variants like:
# http://sequenceontology.org/browser/current_svn/term/SO:0002054
# http://sequenceontology.org/browser/current_svn/term/SO:0002053
# use "assessed" status to issue an evidence code
# FIXME I think that these codes are sub-optimal
status_code = \
a.find('DisorderGeneAssociationStatus').get('id')
# imported automatically asserted information
# used in automatic assertion
eco_id = 'ECO:0000323'
# Assessed
# TODO are these internal ids stable between releases?
if status_code == '17991':
# imported manually asserted information
# used in automatic assertion
eco_id = 'ECO:0000322'
# Non-traceable author statement ECO_0000034
# imported information in automatic assertion ECO_0000313
assoc = G2PAssoc(self.name, alt_locus_id,
disorder_id, rel_id)
assoc.add_evidence(eco_id)
assoc.add_association_to_graph(g)
rlist = a.find('./Gene/ExternalReferenceList')
eqid = None
for r in rlist.findall('ExternalReference'):
if r.find('Source').text == 'Ensembl':
eqid = 'ENSEMBL:'+r.find('Reference').text
elif r.find('Source').text == 'HGNC':
eqid = 'HGNC:'+r.find('Reference').text
elif r.find('Source').text == 'OMIM':
eqid = 'OMIM:'+r.find('Reference').text
else:
pass # skip the others for now
if eqid is not None:
gu.addClassToGraph(g, eqid, None)
gu.addEquivalentClass(g, gene_id, eqid)
elem.clear() # discard the element
if self.testMode and limit is not None and line_counter > limit:
return
gu.loadProperties(
g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
gu.loadAllProperties(g)
return
示例7: _process_diseasegene
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadProperties [as 别名]
#.........这里部分代码省略.........
# get the element name and id
# id = elem.get('id') # some internal identifier
disorder_num = elem.find("OrphaNumber").text
disorder_id = "Orphanet:" + str(disorder_num)
if self.testMode and disorder_id not in config.get_config()["test_ids"]["disease"]:
continue
disorder_label = elem.find("Name").text
# make a hash of internal gene id to type for later lookup
gene_iid_to_type = {}
gene_list = elem.find("GeneList")
for gene in gene_list.findall("Gene"):
gene_iid = gene.get("id")
gene_type = gene.find("GeneType").get("id")
gene_iid_to_type[gene_iid] = gene_type
gu.addClassToGraph(g, disorder_id, disorder_label) # assuming that these are in the ontology
assoc_list = elem.find("DisorderGeneAssociationList")
for a in assoc_list.findall("DisorderGeneAssociation"):
gene_iid = a.find(".//Gene").get("id")
gene_name = a.find(".//Gene/Name").text
gene_symbol = a.find(".//Gene/Symbol").text
gene_num = a.find("./Gene/OrphaNumber").text
gene_id = "Orphanet:" + str(gene_num)
gene_type_id = self._map_gene_type_id(gene_iid_to_type[gene_iid])
gu.addClassToGraph(g, gene_id, gene_symbol, gene_type_id, gene_name)
syn_list = a.find("./Gene/SynonymList")
if int(syn_list.get("count")) > 0:
for s in syn_list.findall("./Synonym"):
gu.addSynonym(g, gene_id, s.text)
dgtype = a.find("DisorderGeneAssociationType").get("id")
rel_id = self._map_rel_id(dgtype)
dg_label = a.find("./DisorderGeneAssociationType/Name").text
if rel_id is None:
logger.warn(
"Cannot map association type (%s) to RO for association (%s | %s). Skipping.",
dg_label,
disorder_label,
gene_symbol,
)
continue
alt_locus_id = "_" + gene_num + "-" + disorder_num + "VL"
alt_label = " ".join(
("some variant of", gene_symbol.strip(), "that is a", dg_label.lower(), disorder_label)
)
if self.nobnodes:
alt_locus_id = ":" + alt_locus_id
gu.addIndividualToGraph(g, alt_locus_id, alt_label, geno.genoparts["variant_locus"])
geno.addAlleleOfGene(alt_locus_id, gene_id)
# consider typing the gain/loss-of-function variants like:
# http://sequenceontology.org/browser/current_svn/term/SO:0002054
# http://sequenceontology.org/browser/current_svn/term/SO:0002053
# use "assessed" status to issue an evidence code
# FIXME I think that these codes are sub-optimal
status_code = a.find("DisorderGeneAssociationStatus").get("id")
eco_id = "ECO:0000323" # imported automatically asserted information used in automatic assertion
if status_code == "17991": # Assessed # TODO are these internal ids stable between releases?
eco_id = "ECO:0000322" # imported manually asserted information used in automatic assertion
# Non-traceable author statement ECO_0000034
# imported information in automatic assertion ECO_0000313
assoc = G2PAssoc(self.name, alt_locus_id, disorder_id, rel_id)
assoc.add_evidence(eco_id)
assoc.add_association_to_graph(g)
rlist = a.find("./Gene/ExternalReferenceList")
eqid = None
for r in rlist.findall("ExternalReference"):
if r.find("Source").text == "Ensembl":
eqid = "ENSEMBL:" + r.find("Reference").text
elif r.find("Source").text == "HGNC":
eqid = "HGNC:" + r.find("Reference").text
elif r.find("Source").text == "OMIM":
eqid = "OMIM:" + r.find("Reference").text
else:
pass # skip the others for now
if eqid is not None:
gu.addClassToGraph(g, eqid, None)
gu.addEquivalentClass(g, gene_id, eqid)
pass
elem.clear() # discard the element
if self.testMode and limit is not None and line_counter > limit:
return
gu.loadProperties(g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
gu.loadAllProperties(g)
return
示例8: _get_gene_info
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadProperties [as 别名]
#.........这里部分代码省略.........
# TODO might have to figure out if things aren't genes, and make them individuals
gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)
# we have to do special things here for genes, because they're classes not individuals
# f = Feature(gene_id,label,gene_type_id,desc)
if name != '-':
gu.addSynonym(g, gene_id, name)
if synonyms.strip() != '-':
for s in synonyms.split('|'):
gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])
if other_designations.strip() != '-':
for s in other_designations.split('|'):
gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])
# deal with the xrefs
# MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
if xrefs.strip() != '-':
for r in xrefs.strip().split('|'):
fixedr = self._cleanup_id(r)
if fixedr is not None and fixedr.strip() != '':
if re.match('HPRD', fixedr):
# proteins are not == genes.
gu.addTriple(g, gene_id, self.properties['has_gene_product'], fixedr)
else:
# skip some of these for now
if fixedr.split(':')[0] not in ['Vega', 'IMGT/GENE-DB']:
gu.addEquivalentClass(g, gene_id, fixedr)
# edge cases of id | symbol | chr | map_loc:
# 263 AMD1P2 X|Y with Xq28 and Yq12
# 438 ASMT X|Y with Xp22.3 or Yp11.3 # in PAR
# 419 ART3 4 with 4q21.1|4p15.1-p14 # no idea why there's two bands listed - possibly 2 assemblies
# 28227 PPP2R3B X|Y Xp22.33; Yp11.3 # in PAR
# 619538 OMS 10|19|3 10q26.3;19q13.42-q13.43;3p25.3 #this is of "unknown" type == susceptibility
# 101928066 LOC101928066 1|Un - # unlocated scaffold
# 11435 Chrna1 2 2 C3|2 43.76 cM # mouse --> 2C3
# 11548 Adra1b 11 11 B1.1|11 25.81 cM # mouse --> 11B1.1
# 11717 Ampd3 7 7 57.85 cM|7 E2-E3 # mouse
# 14421 B4galnt1 10 10 D3|10 74.5 cM # mouse
# 323212 wu:fb92e12 19|20 - # fish
# 323368 ints10 6|18 - # fish
# 323666 wu:fc06e02 11|23 - # fish
# feel that the chr placement can't be trusted in this table when there is > 1 listed
# with the exception of human X|Y, i will only take those that align to one chr
# FIXME remove the chr mapping below when we pull in the genomic coords
if str(chr) != '-' and str(chr) != '':
if re.search('\|', str(chr)) and str(chr) not in ['X|Y','X; Y']:
# this means that there's uncertainty in the mapping. skip it
# TODO we'll need to figure out how to deal with >1 loc mapping
logger.info('%s is non-uniquely mapped to %s. Skipping for now.', gene_id, str(chr))
continue
# X|Y Xp22.33;Yp11.3
# if (not re.match('(\d+|(MT)|[XY]|(Un)$',str(chr).strip())):
# print('odd chr=',str(chr))
if str(chr) == 'X; Y':
chr = 'X|Y' # rewrite the PAR regions for processing
# do this in a loop to allow PAR regions like X|Y
for c in re.split('\|',str(chr)) :
geno.addChromosomeClass(c, tax_id, None) # assume that the chromosome label will get added elsewhere
mychrom = makeChromID(c, tax_num, 'CHR')
mychrom_syn = makeChromLabel(c, tax_num) # temporarily use the taxnum for the disambiguating label
gu.addSynonym(g, mychrom, mychrom_syn)
band_match = re.match('[0-9A-Z]+[pq](\d+)?(\.\d+)?$', map_loc)
if band_match is not None and len(band_match.groups()) > 0:
# if tax_num != '9606':
# continue
# this matches the regular kind of chrs, so make that kind of band
# not sure why this matches? chrX|Y or 10090chr12|Un"
# TODO we probably need a different regex per organism
# the maploc_id already has the numeric chromosome in it, strip it first
bid = re.sub('^'+c, '', map_loc)
maploc_id = makeChromID(c+bid, tax_num, 'CHR') # the generic location (no coordinates)
# print(map_loc,'-->',bid,'-->',maploc_id)
band = Feature(maploc_id, None, None) # Assume it's type will be added elsewhere
band.addFeatureToGraph(g)
# add the band as the containing feature
gu.addTriple(g, gene_id, Feature.object_properties['is_subsequence_of'], maploc_id)
else:
# TODO handle these cases
# examples are: 15q11-q22, Xp21.2-p11.23, 15q22-qter, 10q11.1-q24,
## 12p13.3-p13.2|12p13-p12, 1p13.3|1p21.3-p13.1, 12cen-q21, 22q13.3|22q13.3
logger.debug('not regular band pattern for %s: %s', gene_id, map_loc)
# add the gene as a subsequence of the chromosome
gu.addTriple(g, gene_id, Feature.object_properties['is_subsequence_of'], mychrom)
geno.addTaxon(tax_id, gene_id)
if not self.testMode and limit is not None and line_counter > limit:
break
gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
gu.loadAllProperties(g)
return
示例9: _process_data
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadProperties [as 别名]
#.........这里部分代码省略.........
# can change later if desired.
# since the genotype is reflective of the place
# it got made, should put that in to disambiguate
genotype_name = \
genotype_name+' ['+pheno_center_strain_label+']'
geno.addGenomicBackgroundToGenotype(
pheno_center_strain_id, genotype_id)
geno.addTaxon(pheno_center_strain_id, taxon_id)
# this is redundant, but i'll keep in in for now
geno.addSequenceDerivesFrom(genotype_id, colony_id)
genotype_name += '['+colony+']'
geno.addGenotype(genotype_id, genotype_name)
# Make the sex-qualified genotype,
# which is what the phenotype is associated with
sex_qualified_genotype_id = \
self.make_id(
(colony_id + phenotyping_center + zygosity +
strain_accession_id+sex))
sex_qualified_genotype_label = genotype_name+' ('+sex+')'
if sex == 'male':
sq_type_id = geno.genoparts['male_genotype']
elif sex == 'female':
sq_type_id = geno.genoparts['female_genotype']
else:
sq_type_id = geno.genoparts['sex_qualified_genotype']
geno.addGenotype(
sex_qualified_genotype_id,
sex_qualified_genotype_label, sq_type_id)
geno.addParts(
genotype_id, sex_qualified_genotype_id,
geno.object_properties['has_alternate_part'])
if genomic_background_id is not None and \
genomic_background_id != '':
# Add the taxon to the genomic_background_id
geno.addTaxon(taxon_id, genomic_background_id)
else:
# add it as the genomic background
geno.addTaxon(taxon_id, genotype_id)
# ############# BUILD THE G2P ASSOC #############
# from an old email dated July 23 2014:
# Phenotypes associations are made to
# imits colony_id+center+zygosity+gender
phenotype_id = mp_term_id
# it seems that sometimes phenotype ids are missing.
# indicate here
if phenotype_id is None or phenotype_id == '':
logger.warning(
"No phenotype id specified for row %d: %s",
line_counter, str(row))
continue
# experimental_phenotypic_evidence This was used in ZFIN
eco_id = "ECO:0000059"
# the association comes as a result of a g2p from
# a procedure in a pipeline at a center and parameter tested
assoc = G2PAssoc(self.name, sex_qualified_genotype_id,
phenotype_id)
assoc.add_evidence(eco_id)
# assoc.set_score(float(p_value))
# TODO add evidence instance using
# pipeline_stable_id +
# procedure_stable_id +
# parameter_stable_id
assoc.add_association_to_graph(g)
assoc_id = assoc.get_association_id()
# add a free-text description
description = \
' '.join((mp_term_name, 'phenotype determined by',
phenotyping_center, 'in an',
procedure_name, 'assay where',
parameter_name.strip(),
'was measured with an effect_size of',
str(round(float(effect_size), 5)),
'(p =', "{:.4e}".format(float(p_value)), ').'))
gu.addDescription(g, assoc_id, description)
# TODO add provenance information
# resource_id = resource_name
# assoc.addSource(g, assoc_id, resource_id)
if not self.testMode and \
limit is not None and line_counter > limit:
break
gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)
gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)
return
示例10: UCSCBands
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadProperties [as 别名]
#.........这里部分代码省略.........
self.testgraph = self.graph
logger.info("Found %d nodes", len(self.graph))
logger.info("Done parsing files.")
return
def _get_chrbands(self, limit, taxon):
"""
:param limit:
:return:
"""
# TODO PYLINT figure out what limit was for and why it is unused
line_counter = 0
myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
logger.info("Processing Chr bands from FILE: %s", myfile)
geno = Genotype(self.graph)
monochrom = Monochrom()
# used to hold band definitions for a chr
# in order to compute extent of encompasing bands
mybands = {}
# build the organism's genome from the taxon
genome_label = self.files[taxon]['genome_label']
taxon_id = 'NCBITaxon:'+taxon
# add the taxon as a class. adding the class label elsewhere
self.gu.addClassToGraph(self.graph, taxon_id, None)
self.gu.addSynonym(self.graph, taxon_id, genome_label)
self.gu.loadObjectProperties(self.graph, Feature.object_properties)
self.gu.loadProperties(self.graph, Feature.data_properties,
self.gu.DATAPROP)
self.gu.loadAllProperties(self.graph)
geno.addGenome(taxon_id, genome_label)
# add the build and the taxon it's in
build_num = self.files[taxon]['build_num']
build_id = 'UCSC:'+build_num
geno.addReferenceGenome(build_id, build_num, taxon_id)
# process the bands
with gzip.open(myfile, 'rb') as f:
for line in f:
# skip comments
line = line.decode().strip()
if re.match('^#', line):
continue
# chr13 4500000 10000000 p12 stalk
(scaffold, start, stop, band_num, rtype) = line.split('\t')
line_counter += 1
# NOTE some less-finished genomes have
# placed and unplaced scaffolds
# * Placed scaffolds:
# the scaffolds have been placed within a chromosome.
# * Unlocalized scaffolds:
# although the chromosome within which the scaffold occurs
# is known, the scaffold's position or orientation
# is not known.
# * Unplaced scaffolds:
# it is not known which chromosome the scaffold belongs to
示例11: _process_genes
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadProperties [as 别名]
def _process_genes(self, limit=None):
gu = GraphUtils(curie_map.get())
if self.testMode:
g = self.testgraph
else:
g = self.graph
geno = Genotype(g)
raw = '/'.join((self.rawdir, self.files['genes']['file']))
line_counter = 0
logger.info("Processing HGNC genes")
with open(raw, 'r', encoding="utf8") as csvfile:
filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
for row in filereader:
(hgnc_id, symbol, name, locus_group, locus_type, status,
location, location_sortable, alias_symbol, alias_name,
prev_symbol, prev_name, gene_family, gene_family_id,
date_approved_reserved, date_symbol_changed,
date_name_changed, date_modified, entrez_id, ensembl_gene_id,
vega_id, ucsc_id, ena, refseq_accession, ccds_id, uniprot_ids,
pubmed_id, mgd_id, rgd_id, lsdb, cosmic, omim_id, mirbase,
homeodb, snornabase, bioparadigms_slc, orphanet,
pseudogene_org, horde_id, merops, imgt, iuphar,
kznf_gene_catalog, mamit_trnadb, cd, lncrnadb, enzyme_id,
intermediate_filament_db) = row
line_counter += 1
# skip header
if line_counter <= 1:
continue
if self.testMode and entrez_id != '' \
and int(entrez_id) not in self.gene_ids:
continue
if name == '':
name = None
gene_type_id = self._get_gene_type(locus_type)
gu.addClassToGraph(g, hgnc_id, symbol, gene_type_id, name)
if locus_type == 'withdrawn':
gu.addDeprecatedClass(g, hgnc_id)
if entrez_id != '':
gu.addEquivalentClass(
g, hgnc_id, 'NCBIGene:' + entrez_id)
if ensembl_gene_id != '':
gu.addEquivalentClass(
g, hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
geno.addTaxon('NCBITaxon:9606', hgnc_id)
# add pubs as "is about"
if pubmed_id != '':
for p in re.split(r'\|', pubmed_id.strip()):
if str(p) != '':
gu.addTriple(
g, 'PMID:' + str(p.strip()),
gu.object_properties['is_about'], hgnc_id)
# add chr location
# sometimes two are listed, like: 10p11.2 or 17q25
# -- there are only 2 of these FRA10A and MPFD
# sometimes listed like "1 not on reference assembly"
# sometimes listed like 10q24.1-q24.3
# sometimes like 11q11 alternate reference locus
band = chrom = None
chr_pattern = r'(\d+|X|Y|Z|W|MT)[pq$]'
chr_match = re.match(chr_pattern, location)
if chr_match is not None and len(chr_match.groups()) > 0:
chrom = chr_match.group(1)
chrom_id = makeChromID(chrom, 'NCBITaxon:9606', 'CHR')
band_pattern = r'([pq][A-H\d]?\d?(?:\.\d+)?)'
band_match = re.search(band_pattern, location)
f = Feature(hgnc_id, None, None)
if band_match is not None and len(band_match.groups()) > 0:
band = band_match.group(1)
band = chrom + band
# add the chr band as the parent to this gene
# as a feature but assume that the band is created
# as a class with properties elsewhere in Monochrom
# TEC Monoch? Monarchdom??
band_id = makeChromID(band, 'NCBITaxon:9606', 'CHR')
gu.addClassToGraph(g, band_id, None)
f.addSubsequenceOfFeature(g, band_id)
else:
gu.addClassToGraph(g, chrom_id, None)
f.addSubsequenceOfFeature(g, chrom_id)
if not self.testMode \
and limit is not None and line_counter > limit:
break
# end loop through file
gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
gu.loadAllProperties(g)
#.........这里部分代码省略.........
示例12: _get_variants
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadProperties [as 别名]
#.........这里部分代码省略.........
# CHECK - this makes the assumption that there is only one affected chromosome per variant
# what happens with chromosomal rearrangement variants? shouldn't both chromosomes be here?
# add the hgvs as synonyms
if hgvs_c != '-' and hgvs_c.strip() != '':
gu.addSynonym(g, seqalt_id, hgvs_c)
if hgvs_p != '-' and hgvs_p.strip() != '':
gu.addSynonym(g, seqalt_id, hgvs_p)
# add the dbsnp and dbvar ids as equivalent
if dbsnp_num != '-' and int(dbsnp_num) != -1:
dbsnp_id = 'dbSNP:rs'+str(dbsnp_num)
gu.addIndividualToGraph(g, dbsnp_id, None)
gu.addSameIndividual(g, seqalt_id, dbsnp_id)
if dbvar_num != '-':
dbvar_id = 'dbVar:'+dbvar_num
gu.addIndividualToGraph(g, dbvar_id, None)
gu.addSameIndividual(g, seqalt_id, dbvar_id)
# TODO - not sure if this is right... add as xref?
# the rcv is like the combo of the phenotype with the variant
if rcv_nums != '-':
for rcv_num in re.split(';',rcv_nums):
rcv_id = 'ClinVar:'+rcv_num
gu.addIndividualToGraph(g, rcv_id, None)
gu.addXref(g, seqalt_id, rcv_id)
if gene_id is not None:
# add the gene
gu.addClassToGraph(g, gene_id, gene_symbol)
# make a variant locus
vl_id = '_'+gene_num+'-'+variant_num
if self.nobnodes:
vl_id = ':'+vl_id
vl_label = allele_name
gu.addIndividualToGraph(g, vl_id, vl_label, geno.genoparts['variant_locus'])
geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
geno.addAlleleOfGene(vl_id, gene_id)
else:
# some basic reporting
gmatch = re.search('\(\w+\)', allele_name)
if gmatch is not None and len(gmatch.groups()) > 0:
logger.info("Gene found in allele label, but no id provided: %s", gmatch.group(1))
elif re.match('more than 10', gene_symbol):
logger.info("More than 10 genes found; need to process XML to fetch (variant=%d)", int(variant_num))
else:
logger.info("No gene listed for variant %d", int(variant_num))
# parse the list of "phenotypes" which are diseases. add them as an association
# ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
# the list is both semicolon delimited and comma delimited, but i don't know why!
# some are bad, like: Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
if phenotype_ids != '-':
for p in pheno_list:
m = re.match("(Orphanet:ORPHA(?:\s*ORPHA)?)", p)
if m is not None and len(m.groups()) > 0:
p = re.sub(m.group(1), 'Orphanet:', p.strip())
elif re.match('SNOMED CT', p):
p = re.sub('SNOMED CT', 'SNOMED', p.strip())
assoc = G2PAssoc(self.name, seqalt_id, p.strip())
assoc.add_association_to_graph(g)
if other_ids != '-':
id_list = other_ids.split(',')
# process the "other ids"
# ex: CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
# TODO make more xrefs
for xrefid in id_list:
prefix = xrefid.split(':')[0].strip()
if prefix == 'OMIM Allelic Variant':
xrefid = 'OMIM:'+xrefid.split(':')[1]
gu.addIndividualToGraph(g, xrefid, None)
gu.addSameIndividual(g, seqalt_id, xrefid)
elif prefix == 'HGMD':
gu.addIndividualToGraph(g, xrefid, None)
gu.addSameIndividual(g, seqalt_id, xrefid)
elif prefix == 'dbVar' and dbvar_num == xrefid.split(':')[1].strip():
pass # skip over this one
elif re.search('\s', prefix):
pass
# logger.debug('xref prefix has a space: %s', xrefid)
else:
# should be a good clean prefix
# note that HGMD variants are in here as Xrefs because we can't resolve URIs for them
# logger.info("Adding xref: %s", xrefid)
# gu.addXref(g, seqalt_id, xrefid)
# logger.info("xref prefix to add: %s", xrefid)
pass
if not self.testMode and limit is not None and line_counter > limit:
break
gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)
gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)
logger.info("Finished parsing variants")
return
示例13: _get_orthologs
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadProperties [as 别名]
#.........这里部分代码省略.........
for k in self.files.keys():
f = '/'.join((self.rawdir, self.files[k]['file']))
matchcounter = 0
mytar = tarfile.open(f, 'r:gz')
# assume that the first entry is the item
fname = mytar.getmembers()[0]
logger.info("Parsing %s", fname.name)
line_counter = 0
with mytar.extractfile(fname) as csvfile:
for line in csvfile:
# skip comment lines
if re.match('^#', line.decode()):
logger.info("Skipping header line")
continue
line_counter += 1
# a little feedback to the user since there's so many
if line_counter % 1000000 == 0:
logger.info("Processed %d lines from %s", line_counter, fname.name)
line = line.decode().strip()
# parse each row
# HUMAN|Ensembl=ENSG00000184730|UniProtKB=Q0VD83 MOUSE|MGI=MGI=2176230|UniProtKB=Q8VBT6 LDO Euarchontoglires PTHR15964
(a, b, orthology_class, ancestor_taxon, panther_id) = line.split('\t')
(species_a, gene_a, protein_a) = a.split('|')
(species_b, gene_b, protein_b) = b.split('|')
# skip the entries that don't have homolog relationships with the test ids
if self.testMode and not (re.sub('UniProtKB=', '', protein_a) in self.test_ids or
re.sub('UniProtKB=', '', protein_b) in self.test_ids):
continue
# map the taxon abbreviations to ncbi taxon ids
taxon_a = self._map_taxon_abbr_to_id(species_a)
taxon_b = self._map_taxon_abbr_to_id(species_b)
# ###uncomment the following code block if you want to filter based on taxid
# taxids = [9606,10090,10116,7227,7955,6239,8355] #our favorite animals
# taxids = [9606] #human only
# retain only those orthologous relationships to genes in the specified taxids
# using AND will get you only those associations where gene1 AND gene2 are in the taxid list (most-filter)
# using OR will get you any associations where gene1 OR gene2 are in the taxid list (some-filter)
if (self.tax_ids is not None and
(int(re.sub('NCBITaxon:', '', taxon_a.rstrip())) not in self.tax_ids) and
(int(re.sub('NCBITaxon:', '', taxon_b.rstrip())) not in self.tax_ids)):
continue
else:
matchcounter += 1
if limit is not None and matchcounter > limit:
break
# ###end code block for filtering on taxon
# fix the gene identifiers
gene_a = re.sub('=', ':', gene_a)
gene_b = re.sub('=', ':', gene_b)
clean_gene = self._clean_up_gene_id(gene_a, species_a)
if clean_gene is None:
unprocessed_gene_ids.add(gene_a)
gene_a = clean_gene
clean_gene = self._clean_up_gene_id(gene_b, species_b)
if clean_gene is None:
unprocessed_gene_ids.add(gene_b)
gene_b = clean_gene
# a special case here; mostly some rat genes they use symbols instead of identifiers. will skip
if gene_a is None or gene_b is None:
continue
rel = self._map_orthology_code_to_RO(orthology_class)
evidence_id = 'ECO:0000080' # phylogenetic evidence
# add the association and relevant nodes to graph
assoc = OrthologyAssoc(self.name, gene_a, gene_b, rel)
assoc.add_evidence(evidence_id)
# add genes to graph; assume labels will be taken care of elsewhere
gu.addClassToGraph(g, gene_a, None)
gu.addClassToGraph(g, gene_b, None)
assoc.add_association_to_graph(g)
# note this is incomplete... it won't construct the full family hierarchy, just the top-grouping
assoc.add_gene_family_to_graph(g, ':'.join(('PANTHER', panther_id)))
if not self.testMode and limit is not None and line_counter > limit:
break
logger.info("finished processing %s", f)
logger.warn("The following gene ids were unable to be processed: %s", str(unprocessed_gene_ids))
gu.loadProperties(g, OrthologyAssoc.object_properties, gu.OBJPROP)
gu.loadProperties(g, OrthologyAssoc.annotation_properties, gu.ANNOTPROP)
return
示例14: CTD
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadProperties [as 别名]
#.........这里部分代码省略.........
:param limit (int, optional) limit the number of rows processed
Returns:
:return None
"""
if limit is not None:
logger.info("Only parsing first %d rows", limit)
logger.info("Parsing files...")
# pub_map = dict()
# file_path = '/'.join((self.rawdir,
# self.static_files['publications']['file']))
# if os.path.exists(file_path) is True:
# pub_map = self._parse_publication_file(
# self.static_files['publications']['file']
# )
if self.testOnly:
self.testMode = True
if self.testMode:
self.g = self.testgraph
else:
self.g = self.graph
self.geno = Genotype(self.g)
self.path = Pathway(self.g, self.nobnodes)
self._parse_ctd_file(
limit, self.files['chemical_disease_interactions']['file'])
self._parse_ctd_file(limit, self.files['gene_pathway']['file'])
self._parse_ctd_file(limit, self.files['gene_disease']['file'])
self._parse_curated_chem_disease(limit)
self.gu.loadAllProperties(self.g)
self.gu.loadProperties(
self.g, G2PAssoc.object_properties, self.gu.OBJPROP)
self.gu.loadProperties(
self.g, G2PAssoc.datatype_properties, self.gu.DATAPROP)
self.gu.loadProperties(
self.g, G2PAssoc.annotation_properties, self.gu.ANNOTPROP)
self.gu.loadProperties(
self.g, Pathway.object_properties, self.gu.OBJPROP)
self.load_bindings()
logger.info("Done parsing files.")
return
def _parse_ctd_file(self, limit, file):
"""
Parses files in CTD.files dictionary
Args:
:param limit (int): limit the number of rows processed
:param file (str): file name (must be defined in CTD.file)
Returns:
:return None
"""
row_count = 0
version_pattern = re.compile(r'^# Report created: (.+)$')
is_versioned = False
file_path = '/'.join((self.rawdir, file))
with gzip.open(file_path, 'rt') as tsvfile:
reader = csv.reader(tsvfile, delimiter="\t")
for row in reader:
# Scan the header lines until we get the version
# There is no official version sp we are using
# the upload timestamp instead
示例15: process_catalog
# 需要导入模块: from dipper.utils.GraphUtils import GraphUtils [as 别名]
# 或者: from dipper.utils.GraphUtils.GraphUtils import loadProperties [as 别名]
def process_catalog(self, limit=None):
"""
:param limit:
:return:
"""
raw = '/'.join((self.rawdir, self.files['catalog']['file']))
logger.info("Processing Data from %s", raw)
gu = GraphUtils(curie_map.get())
if self.testMode: # set the graph to build
g = self.testgraph
else:
g = self.graph
line_counter = 0
geno = Genotype(g)
gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
gu.loadAllProperties(g)
tax_id = 'NCBITaxon:9606' # hardcode
genome_version = 'GRCh38' # hardcode
# build a hashmap of genomic location to identifiers,
# to try to get the equivalences
loc_to_id_hash = {}
with open(raw, 'r', encoding="iso-8859-1") as csvfile:
filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
next(filereader, None) # skip the header row
for row in filereader:
if not row:
pass
else:
line_counter += 1
(date_added_to_catalog, pubmed_num, first_author, pub_date,
journal, link, study_name, disease_or_trait,
initial_sample_description, replicate_sample_description,
region, chrom_num, chrom_pos, reported_gene_nums,
mapped_gene, upstream_gene_num, downstream_gene_num,
snp_gene_nums, upstream_gene_distance,
downstream_gene_distance, strongest_snp_risk_allele, snps,
merged, snp_id_current, context, intergenic_flag,
risk_allele_frequency, pvalue, pvalue_mlog, pvalue_text,
or_or_beta, confidence_interval_95,
platform_with_snps_passing_qc, cnv_flag, mapped_trait,
mapped_trait_uri) = row
intersect = \
list(set([str(i) for i in self.test_ids['gene']]) &
set(re.split(r',', snp_gene_nums)))
# skip if no matches found in test set
if self.testMode and len(intersect) == 0:
continue
# 06-May-2015 25917933 Zai CC 20-Nov-2014 J Psychiatr Res http://europepmc.org/abstract/MED/25917933
# A genome-wide association study of suicide severity scores in bipolar disorder.
# Suicide in bipolar disorder
# 959 European ancestry individuals NA
# 10p11.22 10 32704340 C10orf68, CCDC7, ITGB1 CCDC7
# rs7079041-A rs7079041 0 7079041 intron 0 2E-6 5.698970
if chrom_num != '' and chrom_pos != '':
loc = 'chr'+str(chrom_num)+':'+str(chrom_pos)
if loc not in loc_to_id_hash:
loc_to_id_hash[loc] = set()
else:
loc = None
if re.search(r' x ', strongest_snp_risk_allele) \
or re.search(r',', strongest_snp_risk_allele):
# TODO deal with haplotypes
logger.warning(
"We can't deal with haplotypes yet: %s",
strongest_snp_risk_allele)
continue
elif re.match(r'rs', strongest_snp_risk_allele):
rs_id = 'dbSNP:'+strongest_snp_risk_allele.strip()
# remove the alteration
elif re.match(r'kgp', strongest_snp_risk_allele):
# FIXME this isn't correct
rs_id = 'dbSNP:'+strongest_snp_risk_allele.strip()
# http://www.1000genomes.org/faq/what-are-kgp-identifiers
# for some information
# They were created by Illumina for their genotyping
# platform before some variants identified during the
# pilot phase of the project had been assigned
# rs numbers.
elif re.match(r'chr', strongest_snp_risk_allele):
# like: chr10:106180121-G
rs_id = ':gwas-' + \
re.sub(
r':', '-', strongest_snp_risk_allele.strip())
elif strongest_snp_risk_allele.strip() == '':
# logger.debug(
# "No strongest SNP risk allele for %s:\n%s",
# pubmed_num, str(row))
# FIXME still consider adding in the EFO terms
# for what the study measured?
#.........这里部分代码省略.........