本文整理汇总了Python中dipper.models.Genotype.Genotype.addReferenceGenome方法的典型用法代码示例。如果您正苦于以下问题:Python Genotype.addReferenceGenome方法的具体用法?Python Genotype.addReferenceGenome怎么用?Python Genotype.addReferenceGenome使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类dipper.models.Genotype.Genotype
的用法示例。
在下文中一共展示了Genotype.addReferenceGenome方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addReferenceGenome [as 别名]
def parse(self, limit=None):
"""
:param limit:
:return:
"""
if limit is not None:
logger.info("Only parsing first %s rows fo each file", str(limit))
logger.info("Parsing files...")
if self.testOnly:
self.testMode = True
g = self.testgraph
else:
g = self.graph
tmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
self._process_trait_mappings(tmap, limit)
geno = Genotype(g)
# organisms = ['chicken']
organisms = [
'chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']
for o in organisms:
tax_id = self._get_tax_by_common_name(o)
geno.addGenome(tax_id, o)
build_id = None
build = None
k = o+'_bp'
if k in self.files:
file = self.files[k]['file']
m = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', file)
if m is None:
logger.error("Can't match a gff build")
else:
build = m.group(1)
build_id = self._map_build_by_abbrev(build)
logger.info("Build = %s", build_id)
geno.addReferenceGenome(build_id, build, tax_id)
if build_id is not None:
self._process_QTLs_genomic_location(
'/'.join((self.rawdir, file)), tax_id, build_id, build,
limit)
k = o+'_cm'
if k in self.files:
file = self.files[k]['file']
self._process_QTLs_genetic_location(
'/'.join((self.rawdir, file)), tax_id, o, limit)
logger.info("Finished parsing")
self.load_bindings()
logger.info("Found %d nodes", len(self.graph))
return
示例2: _create_genome_builds
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addReferenceGenome [as 别名]
def _create_genome_builds(self):
"""
Various resources will map variations to either UCSC (hg*)
or to NCBI assemblies. Here we create the equivalences between them.
Data taken from:
https://genome.ucsc.edu/FAQ/FAQreleases.html#release1
:return:
"""
# TODO add more species
graph = self.graph
geno = Genotype(graph)
model = Model(graph)
LOG.info("Adding equivalent assembly identifiers")
for sp in self.species:
tax_id = self.globaltt[sp]
txid_num = tax_id.split(':')[1]
for key in self.files[txid_num]['assembly']:
ucsc_id = key
try:
ucsc_label = ucsc_id.split(':')[1]
except IndexError:
LOG.error('%s Assembly id: "%s" is problematic', sp, key)
continue
if key in self.localtt:
mapped_id = self.localtt[key]
else:
LOG.error(
'%s Assembly id: "%s" is not in local translation table',
sp, key)
mapped_label = mapped_id.split(':')[1]
mapped_label = 'NCBI build ' + str(mapped_label)
geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
model.addSameIndividual(ucsc_id, mapped_id)
return
示例3: _get_variants
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addReferenceGenome [as 别名]
#.........这里部分代码省略.........
# if ((self.filter == 'taxids' and\
# (int(tax_num) not in self.tax_ids)) or\
# (self.filter == 'geneids' and\
# (int(gene_num) not in self.gene_ids))):
# continue
# #### end filter
line_counter += 1
pheno_list = []
if phenotype_ids != '-':
# trim any leading/trailing semicolons/commas
phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
pheno_list = re.split(r'[,;]', phenotype_ids)
if self.testMode:
# get intersection of test disease ids
# and these phenotype_ids
intersect = \
list(
set([str(i)
for i in self.disease_ids]) & set(pheno_list))
if int(gene_num) not in self.gene_ids and\
int(variant_num) not in self.variant_ids and\
len(intersect) < 1:
continue
# TODO may need to switch on assembly to create correct
# assembly/build identifiers
build_id = ':'.join(('NCBIGenome', assembly))
# make the reference genome build
geno.addReferenceGenome(build_id, assembly, tax_id)
allele_type_id = self._map_type_of_allele(allele_type)
bandinbuild_id = None
if str(chr) == '':
# check cytogenic location
if str(cytogenetic_loc).strip() != '':
# use cytogenic location to get the apx location
# oddly, they still put an assembly number even when
# there's no numeric location
if not re.search(r'-', str(cytogenetic_loc)):
band_id = makeChromID(
re.split(r'-', str(cytogenetic_loc)),
tax_num, 'CHR')
geno.addChromosomeInstance(
cytogenetic_loc, build_id, assembly, band_id)
bandinbuild_id = makeChromID(
re.split(r'-', str(cytogenetic_loc)),
assembly, 'MONARCH')
else:
# can't deal with ranges yet
pass
else:
# add the human chromosome class to the graph,
# and add the build-specific version of it
chr_id = makeChromID(str(chr), tax_num, 'CHR')
geno.addChromosomeClass(str(chr), tax_id, tax_label)
geno.addChromosomeInstance(
str(chr), build_id, assembly, chr_id)
chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')
seqalt_id = ':'.join(('ClinVarVariant', variant_num))
gene_id = None
示例4: _create_genome_builds
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addReferenceGenome [as 别名]
def _create_genome_builds(self):
"""
Various resources will map variations to either UCSC (hg*)
or to NCBI assemblies. Here we create the equivalences between them.
Data taken from:
https://genome.ucsc.edu/FAQ/FAQreleases.html#release1
:return:
"""
# TODO add more species
ucsc_assembly_id_map = {
"9606": {
"UCSC:hg38": "NCBIGenome:GRCh38",
"UCSC:hg19": "NCBIGenome:GRCh37",
"UCSC:hg18": "NCBIGenome:36.1",
"UCSC:hg17": "NCBIGenome:35",
"UCSC:hg16": "NCBIGenome:34",
"UCSC:hg15": "NCBIGenome:33",
},
"7955": {
"UCSC:danRer10": "NCBIGenome:GRCz10",
"UCSC:danRer7": "NCBIGenome:Zv9",
"UCSC:danRer6": "NCBIGenome:Zv8",
},
"10090": {
"UCSC:mm10": "NCBIGenome:GRCm38",
"UCSC:mm9": "NCBIGenome:37"
},
"9031": {
"UCSC:galGal4": "NCBIAssembly:317958",
},
"9913": {
"UCSC:bosTau7": "NCBIAssembly:GCF_000003205.5",
},
"9823": {
"UCSC:susScr3": "NCBIAssembly:304498",
},
"9940": {
"UCSC:oviAri3": "NCBIAssembly:GCF_000298735.1",
},
"9796": {
"UCSC:equCab2": "NCBIAssembly:GCF_000002305.2",
}
}
g = self.graph
geno = Genotype(g)
model = Model(g)
logger.info("Adding equivalent assembly identifiers")
for sp in ucsc_assembly_id_map:
tax_num = sp
tax_id = 'NCBITaxon:'+tax_num
mappings = ucsc_assembly_id_map[sp]
for i in mappings:
ucsc_id = i
ucsc_label = re.split(':', i)[1]
mapped_id = mappings[i]
mapped_label = re.split(':', mapped_id)[1]
mapped_label = 'NCBI build '+str(mapped_label)
geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
model.addSameIndividual(ucsc_id, mapped_id)
return
示例5: _get_chrbands
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addReferenceGenome [as 别名]
def _get_chrbands(self, limit, taxon):
"""
:param limit:
:return:
"""
model = Model(self.graph)
# TODO PYLINT figure out what limit was for and why it is unused
line_counter = 0
myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
logger.info("Processing Chr bands from FILE: %s", myfile)
geno = Genotype(self.graph)
monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)
# used to hold band definitions for a chr
# in order to compute extent of encompasing bands
mybands = {}
# build the organism's genome from the taxon
genome_label = self.files[taxon]['genome_label']
taxon_id = 'NCBITaxon:'+taxon
# add the taxon as a class. adding the class label elsewhere
model.addClassToGraph(taxon_id, None)
model.addSynonym(taxon_id, genome_label)
geno.addGenome(taxon_id, genome_label)
# add the build and the taxon it's in
build_num = self.files[taxon]['build_num']
build_id = 'UCSC:'+build_num
geno.addReferenceGenome(build_id, build_num, taxon_id)
# process the bands
with gzip.open(myfile, 'rb') as f:
for line in f:
# skip comments
line = line.decode().strip()
if re.match('^#', line):
continue
# chr13 4500000 10000000 p12 stalk
(scaffold, start, stop, band_num, rtype) = line.split('\t')
line_counter += 1
# NOTE some less-finished genomes have
# placed and unplaced scaffolds
# * Placed scaffolds:
# the scaffolds have been placed within a chromosome.
# * Unlocalized scaffolds:
# although the chromosome within which the scaffold occurs
# is known, the scaffold's position or orientation
# is not known.
# * Unplaced scaffolds:
# it is not known which chromosome the scaffold belongs to
#
# find out if the thing is a full on chromosome, or a scaffold:
# ex: unlocalized scaffold: chr10_KL568008v1_random
# ex: unplaced scaffold: chrUn_AABR07022428v1
placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
unlocalized_scaffold_pattern = \
placed_scaffold_pattern+r'_(\w+)_random'
unplaced_scaffold_pattern = r'chr(Un(?:_\w+)?)'
m = re.match(placed_scaffold_pattern+r'$', scaffold)
if m is not None and len(m.groups()) == 1:
# the chromosome is the first match of the pattern
chrom_num = m.group(1)
else:
# skip over anything that isn't a placed_scaffold
# at the class level
logger.info("Found non-placed chromosome %s", scaffold)
chrom_num = None
m_chr_unloc = re.match(unlocalized_scaffold_pattern, scaffold)
m_chr_unplaced = re.match(unplaced_scaffold_pattern, scaffold)
scaffold_num = None
if m:
pass
elif m_chr_unloc is not None and\
len(m_chr_unloc.groups()) == 2:
chrom_num = m_chr_unloc.group(1)
scaffold_num = chrom_num+'_'+m_chr_unloc.group(2)
elif m_chr_unplaced is not None and\
len(m_chr_unplaced.groups()) == 1:
scaffold_num = m_chr_unplaced.group(1)
else:
logger.error(
"There's a chr pattern that we aren't matching: %s",
scaffold)
if chrom_num is not None:
# the chrom class (generic) id
chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')
# first, add the chromosome class (in the taxon)
geno.addChromosomeClass(
chrom_num, taxon_id, self.files[taxon]['genome_label'])
#.........这里部分代码省略.........
示例6: _process_qtls_genetic_location
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addReferenceGenome [as 别名]
def _process_qtls_genetic_location(
self, raw, txid, common_name, limit=None):
"""
This function processes
Triples created:
:param limit:
:return:
"""
aql_curie = self.files[common_name + '_cm']['curie']
if self.test_mode:
graph = self.testgraph
else:
graph = self.graph
line_counter = 0
geno = Genotype(graph)
model = Model(graph)
eco_id = self.globaltt['quantitative trait analysis evidence']
taxon_curie = 'NCBITaxon:' + txid
LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
with open(raw, 'r', encoding="iso-8859-1") as csvfile:
filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
for row in filereader:
line_counter += 1
(qtl_id,
qtl_symbol,
trait_name,
assotype,
empty,
chromosome,
position_cm,
range_cm,
flankmark_a2,
flankmark_a1,
peak_mark,
flankmark_b1,
flankmark_b2,
exp_id,
model_id,
test_base,
sig_level,
lod_score,
ls_mean,
p_values,
f_statistics,
variance,
bayes_value,
likelihood_ratio,
trait_id, dom_effect,
add_effect,
pubmed_id,
gene_id,
gene_id_src,
gene_id_type,
empty2) = row
if self.test_mode and int(qtl_id) not in self.test_ids:
continue
qtl_id = common_name + 'QTL:' + qtl_id.strip()
trait_id = ':'.join((aql_curie, trait_id.strip()))
# Add QTL to graph
feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
feature.addTaxonToFeature(taxon_curie)
# deal with the chromosome
chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')
# add a version of the chromosome which is defined as
# the genetic map
build_id = 'MONARCH:'+common_name.strip()+'-linkage'
build_label = common_name+' genetic map'
geno.addReferenceGenome(build_id, build_label, taxon_curie)
chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
geno.addChromosomeInstance(
chromosome, build_id, build_label, chrom_id)
start = stop = None
# range_cm sometimes ends in "(Mb)" (i.e pig 2016 Nov)
range_mb = re.split(r'\(', range_cm)
if range_mb is not None:
range_cm = range_mb[0]
if re.search(r'[0-9].*-.*[0-9]', range_cm):
range_parts = re.split(r'-', range_cm)
# check for poorly formed ranges
if len(range_parts) == 2 and\
range_parts[0] != '' and range_parts[1] != '':
(start, stop) = [
int(float(x.strip())) for x in re.split(r'-', range_cm)]
else:
LOG.info(
"A cM range we can't handle for QTL %s: %s",
qtl_id, range_cm)
#.........这里部分代码省略.........
示例7: parse
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addReferenceGenome [as 别名]
def parse(self, limit=None):
"""
:param limit:
:return:
"""
if limit is not None:
LOG.info("Only parsing first %s rows fo each file", str(limit))
LOG.info("Parsing files...")
if self.test_only:
self.test_mode = True
graph = self.testgraph
else:
graph = self.graph
traitmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
self._process_trait_mappings(traitmap, limit)
geno = Genotype(graph)
animals = ['chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']
for common_name in animals:
txid_num = self.resolve(common_name).split(':')[1]
taxon_label = self.localtt[common_name]
taxon_curie = self.globaltt[taxon_label]
taxon_num = taxon_curie.split(':')[1]
txid_num = taxon_num # for now
taxon_word = taxon_label.replace(' ', '_')
gene_info_file = '/'.join((
self.rawdir, self.files[taxon_word + '_info']['file']))
self.gene_info = list()
LOG.info('Ingesting %s', gene_info_file)
with gzip.open(gene_info_file, 'rt') as gi_gz:
filereader = csv.reader(gi_gz, delimiter='\t')
for row in filereader:
if row[0][0] == '#':
continue
else:
self.gene_info.append(str(row[1])) # tossing lots of good stuff
LOG.info(
'Gene Info for %s has %i enteries', common_name, len(self.gene_info))
# LOG.info('Gene Info entery looks like %s', self.gene_info[5])
build = None
fname_bp = common_name + '_bp'
if fname_bp in self.files:
bpfile = self.files[fname_bp]['file']
mch = re.search(r'QTL_([\w\.]+)\.gff.txt.gz', bpfile)
if mch is None:
LOG.error("Can't match a gff build to " + fname_bp)
else:
build = mch.group(1)
build_id = self.localtt[build]
LOG.info("Build UCSC label is: %s", build_id)
# NCBI assembly curie is
geno.addReferenceGenome(build_id, build, txid_num)
if build_id is not None:
self._process_qtls_genomic_location(
'/'.join((self.rawdir, bpfile)), txid_num, build_id, build,
common_name, limit)
fname_cm = common_name + '_cm'
if fname_cm in self.files:
cmfile = self.files[fname_cm]['file']
self._process_qtls_genetic_location(
'/'.join((self.rawdir, cmfile)), txid_num, common_name, limit)
LOG.info("Finished parsing")
return
示例8: _process_QTLs_genetic_location
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addReferenceGenome [as 别名]
def _process_QTLs_genetic_location(self, raw, taxon_id, common_name, limit=None):
"""
This function processes
Triples created:
:param limit:
:return:
"""
if self.testMode:
g = self.testgraph
else:
g = self.graph
line_counter = 0
geno = Genotype(g)
gu = GraphUtils(curie_map.get())
eco_id = "ECO:0000061" # Quantitative Trait Analysis Evidence
logger.info("Processing genetic location for %s", taxon_id)
with open(raw, 'r', encoding="iso-8859-1") as csvfile:
filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
for row in filereader:
line_counter += 1
(qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome, position_cm, range_cm,
flankmark_a2, flankmark_a1, peak_mark, flankmark_b1, flankmark_b2, exp_id, model, test_base,
sig_level, lod_score, ls_mean, p_values, f_statistics, variance, bayes_value, likelihood_ratio,
trait_id, dom_effect, add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type, empty2) = row
if self.testMode and int(qtl_id) not in self.test_ids:
continue
qtl_id = 'AQTL:'+qtl_id
trait_id = 'AQTLTrait:'+trait_id
# Add QTL to graph
f = Feature(qtl_id, qtl_symbol, geno.genoparts['QTL'])
f.addTaxonToFeature(g, taxon_id)
# deal with the chromosome
chrom_id = makeChromID(chromosome, taxon_id, 'CHR')
# add a version of the chromosome which is defined as the genetic map
build_id = 'MONARCH:'+common_name.strip()+'-linkage'
build_label = common_name+' genetic map'
geno.addReferenceGenome(build_id, build_label, taxon_id)
chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
geno.addChromosomeInstance(chromosome, build_id, build_label, chrom_id)
start = stop = None
if re.search('-', range_cm):
range_parts = re.split('-', range_cm)
# check for poorly formed ranges
if len(range_parts) == 2 and range_parts[0] != '' and range_parts[1] != '':
(start, stop) = [int(float(x.strip())) for x in re.split('-', range_cm)]
else:
logger.info("There's a cM range we can't handle for QTL %s: %s", qtl_id, range_cm)
elif position_cm != '':
start = stop = int(float(position_cm))
# FIXME remove converion to int for start/stop when schema can handle floats
# add in the genetic location based on the range
f.addFeatureStartLocation(start, chrom_in_build_id, None, [Feature.types['FuzzyPosition']])
f.addFeatureEndLocation(stop, chrom_in_build_id, None, [Feature.types['FuzzyPosition']])
f.addFeatureToGraph(g)
# sometimes there's a peak marker, like a rsid. we want to add that as a variant of the gene,
# and xref it to the qtl.
dbsnp_id = None
if peak_mark != '' and peak_mark != '.' and re.match('rs', peak_mark.strip()):
dbsnp_id = 'dbSNP:'+peak_mark.strip()
gu.addIndividualToGraph(g, dbsnp_id, None, geno.genoparts['sequence_alteration'])
gu.addXref(g, qtl_id, dbsnp_id)
if gene_id is not None and gene_id != '' and gene_id != '.':
if gene_id_src == 'NCBIgene' or gene_id_src == '': # we assume if no src is provided, it's NCBI
gene_id = 'NCBIGene:'+gene_id.strip()
geno.addGene(gene_id, None) # we will expect that these labels provided elsewhere
geno.addAlleleOfGene(qtl_id, gene_id, geno.object_properties['feature_to_gene_relation']) # FIXME what is the right relationship here?
if dbsnp_id is not None:
# add the rsid as a seq alt of the gene_id
vl_id = '_' + re.sub(':', '', gene_id) + '-' + peak_mark
if self.nobnodes:
vl_id = ':' + vl_id
geno.addSequenceAlterationToVariantLocus(dbsnp_id, vl_id)
geno.addAlleleOfGene(vl_id, gene_id)
# add the trait
gu.addClassToGraph(g, trait_id, trait_name)
# Add publication
r = None
if re.match('ISU.*', pubmed_id):
pub_id = 'AQTLPub:'+pubmed_id.strip()
r = Reference(pub_id)
elif pubmed_id != '':
pub_id = 'PMID:'+pubmed_id.strip()
r = Reference(pub_id, Reference.ref_types['journal_article'])
if r is not None:
#.........这里部分代码省略.........