本文整理汇总了 Python 中 dipper.models.Genotype.Genotype.addGenome 方法的典型用法代码示例。如果您正苦于以下问题:Python Genotype.addGenome 方法的具体用法?Python Genotype.addGenome 怎么用?Python Genotype.addGenome 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 dipper.models.Genotype.Genotype 的用法示例。
在下文中一共展示了Genotype.addGenome方法的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addGenome [as 别名]
def parse(self, limit=None):
    """
    Parse the trait-mapping file and the per-organism QTL files.

    The trait-mapping file listed under ``self.files['trait_mappings']``
    is processed first.  Then, for every organism in a fixed list, the
    organism's genome is added to the graph and:

    * if ``self.files`` has an ``<organism>_bp`` entry, the build
      abbreviation is extracted from that file's name, the reference
      genome is added, and (when the abbreviation maps to a build id)
      the genomic-location QTLs are processed.  If no build can be
      extracted from the file name an error is logged and the file is
      skipped;
    * if ``self.files`` has an ``<organism>_cm`` entry, the
      genetic-location QTLs are processed.

    When ``self.testOnly`` is set, ``self.testMode`` is switched on and
    the genome nodes are written to ``self.testgraph`` instead of
    ``self.graph``.

    :param limit: forwarded unchanged to the per-file processors
        (presumably a maximum row count per file -- confirm against
        those processors); ``None`` skips the "only parsing" notice
    :return: None
    """
    if limit is not None:
        logger.info("Only parsing first %s rows for each file", limit)
    logger.info("Parsing files...")

    if self.testOnly:
        self.testMode = True
        g = self.testgraph
    else:
        g = self.graph

    tmap = '/'.join((self.rawdir, self.files['trait_mappings']['file']))
    self._process_trait_mappings(tmap, limit)

    geno = Genotype(g)
    organisms = [
        'chicken', 'pig', 'horse', 'rainbow_trout', 'sheep', 'cattle']

    for organism in organisms:
        tax_id = self._get_tax_by_common_name(organism)
        geno.addGenome(tax_id, organism)

        # QTLs positioned in base pairs on a genome build
        bp_key = organism + '_bp'
        if bp_key in self.files:
            bp_file = self.files[bp_key]['file']
            # the build abbreviation is embedded in the file name; the
            # dots of the extension are escaped so only a literal
            # ".gff.txt.gz" suffix is accepted
            m = re.search(r'QTL_([\w\.]+)\.gff\.txt\.gz', bp_file)
            if m is None:
                logger.error("Can't match a gff build")
            else:
                build = m.group(1)
                build_id = self._map_build_by_abbrev(build)
                logger.info("Build = %s", build_id)
                geno.addReferenceGenome(build_id, build, tax_id)
                # an abbreviation without a known build id cannot be
                # anchored, so its locations are not processed
                if build_id is not None:
                    self._process_QTLs_genomic_location(
                        '/'.join((self.rawdir, bp_file)),
                        tax_id, build_id, build, limit)

        # QTLs positioned on a genetic map
        cm_key = organism + '_cm'
        if cm_key in self.files:
            cm_file = self.files[cm_key]['file']
            self._process_QTLs_genetic_location(
                '/'.join((self.rawdir, cm_file)), tax_id, organism, limit)

    logger.info("Finished parsing")
    self.load_bindings()
    logger.info("Found %d nodes", len(self.graph))
示例2: _process_all
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addGenome [as 别名]
def _process_all(self, limit):
    """
    Build the OMIM classes for every known OMIM identifier.

    The identifiers come from the omim.txt.Z file; each one is looked up
    through the OMIM API (JSON) and turned into an OMIM class carrying
    its label, definition and some synonyms.

    * "removed" entries become deprecated classes.
    * "moved" entries are deprecated and receive "consider" annotations.

    Also extracted:

    * phenotypicSeries ids, as superclasses
    * equivalent ids for Orphanet and UMLS

    In testMode only the items listed in test_ids are written, and they
    go to the testgraph.

    :param limit: handed on to ``process_entries``
    :return: None
    """
    # the full set of OMIM identifiers to work through
    omim_numbers = self._get_omim_ids()

    graph = self.testgraph if self.testMode else self.graph
    genotype = Genotype(graph)
    model = Model(graph)

    # register the human genome and its taxon; the taxon's own label is
    # attached elsewhere, hence the None below
    human_taxon = 'NCBITaxon:9606'
    genotype.addGenome(human_taxon, 'Human')
    model.addClassToGraph(human_taxon, None)

    self.process_entries(
        omim_numbers, self._transform_entry, {'all'}, graph, limit)
    return
示例3: _get_variants
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addGenome [as 别名]
def _get_variants(self, limit):
"""
Currently loops through the variant_summary file.
:param limit:
:return:
"""
if self.testMode:
g = self.testgraph
else:
g = self.graph
model = Model(g)
geno = Genotype(g)
f = Feature(g, None, None, None)
# add the taxon and the genome
tax_num = '9606' # HARDCODE
tax_id = 'NCBITaxon:'+tax_num
tax_label = 'Human'
model.addClassToGraph(tax_id, None)
geno.addGenome(tax_id, tax_label) # label gets added elsewhere
# not unzipping the file
logger.info("Processing Variant records")
line_counter = 0
myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
with gzip.open(myfile, 'rb') as f:
for line in f:
# skip comments
line = line.decode().strip()
if re.match(r'^#', line):
continue
# AlleleID integer value as stored in the AlleleID field in ClinVar (//Measure/@ID in the XML)
# Type character, the type of variation
# Name character, the preferred name for the variation
# GeneID integer, GeneID in NCBI's Gene database
# GeneSymbol character, comma-separated list of GeneIDs overlapping the variation
# ClinicalSignificance character, comma-separated list of values of clinical significance reported for this variation
# for the mapping between the terms listed here and the integers in the .VCF files, see
# http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
# RS# (dbSNP) integer, rs# in dbSNP
# nsv (dbVar) character, the NSV identifier for the region in dbVar
# RCVaccession character, list of RCV accessions that report this variant
# TestedInGTR character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
# PhenotypeIDs character, list of db names and identifiers for phenotype(s) reported for this variant
# Origin character, list of all allelic origins for this variation
# Assembly character, name of the assembly on which locations are based
# Chromosome character, chromosomal location
# Start integer, starting location, in pter->qter orientation
# Stop integer, end location, in pter->qter orientation
# Cytogenetic character, ISCN band
# ReviewStatus character, highest review status for reporting this measure. For the key to the terms,
# and their relationship to the star graphics ClinVar displays on its web pages,
# see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
# HGVS(c.) character, RefSeq cDNA-based HGVS expression
# HGVS(p.) character, RefSeq protein-based HGVS expression
# NumberSubmitters integer, number of submissions with this variant
# LastEvaluated datetime, the latest time any submitter reported clinical significance
# Guidelines character, ACMG only right now, for the reporting of incidental variation in a Gene
# (NOTE: if ACMG, not a specific to the allele but to the Gene)
# OtherIDs character, list of other identifiers or sources of information about this variant
# VariantID integer, the value used to build the URL for the current default report,
# e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
#
# a crude check that there's an expected number of cols.
# if not, error out because something changed.
num_cols = len(line.split('\t'))
expected_numcols = 29
if num_cols != expected_numcols:
logger.error(
"Unexpected number of columns in raw file " +
"(%d actual vs %d expected)",
num_cols, expected_numcols)
(allele_num, allele_type, allele_name, gene_num, gene_symbol,
clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
tested_in_gtr, phenotype_ids, origin, assembly, chr, start,
stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
number_of_submitters, last_eval, guidelines, other_ids,
variant_num, reference_allele, alternate_allele, categories,
ChromosomeAccession) = line.split('\t')
# ###set filter=None in init if you don't want to have a filter
# if self.filter is not None:
# if ((self.filter == 'taxids' and\
# (int(tax_num) not in self.tax_ids)) or\
# (self.filter == 'geneids' and\
# (int(gene_num) not in self.gene_ids))):
# continue
# #### end filter
line_counter += 1
pheno_list = []
#.........这里部分代码省略.........
示例4: _get_gene_info
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addGenome [as 别名]
def _get_gene_info(self, limit):
"""
Currently loops through the gene_info file and creates the genes as classes, typed with SO. It will add their
label, any alternate labels as synonyms, alternate ids as equivlaent classes. HPRDs get added as
protein products. The chromosome and chr band get added as blank node regions, and the gene is faldo:located
on the chr band.
:param limit:
:return:
"""
gu = GraphUtils(curie_map.get())
if self.testMode:
g = self.testgraph
else:
g = self.graph
geno = Genotype(g)
# not unzipping the file
logger.info("Processing Gene records")
line_counter = 0
myfile = '/'.join((self.rawdir, self.files['gene_info']['file']))
logger.info("FILE: %s", myfile)
# Add taxa and genome classes for those in our filter
for tax_num in self.tax_ids:
tax_id = ':'.join(('NCBITaxon', str(tax_num)))
geno.addGenome(tax_id, str(tax_num)) # tax label can get added elsewhere
gu.addClassToGraph(g, tax_id, None) # label added elsewhere
with gzip.open(myfile, 'rb') as f:
for line in f:
# skip comments
line = line.decode().strip()
if re.match('^#', line):
continue
(tax_num, gene_num, symbol, locustag,
synonyms, xrefs, chr, map_loc, desc,
gtype, authority_symbol, name,
nomenclature_status, other_designations, modification_date) = line.split('\t')
##### set filter=None in init if you don't want to have a filter
#if self.filter is not None:
# if ((self.filter == 'taxids' and (int(tax_num) not in self.tax_ids))
# or (self.filter == 'geneids' and (int(gene_num) not in self.gene_ids))):
# continue
##### end filter
if self.testMode and int(gene_num) not in self.gene_ids:
continue
if int(tax_num) not in self.tax_ids:
continue
line_counter += 1
gene_id = ':'.join(('NCBIGene', gene_num))
tax_id = ':'.join(('NCBITaxon', tax_num))
gene_type_id = self._map_type_of_gene(gtype)
if symbol == 'NEWENTRY':
label = None
else:
label = symbol
# TODO might have to figure out if things aren't genes, and make them individuals
gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)
# we have to do special things here for genes, because they're classes not individuals
# f = Feature(gene_id,label,gene_type_id,desc)
if name != '-':
gu.addSynonym(g, gene_id, name)
if synonyms.strip() != '-':
for s in synonyms.split('|'):
gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])
if other_designations.strip() != '-':
for s in other_designations.split('|'):
gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])
# deal with the xrefs
# MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
if xrefs.strip() != '-':
for r in xrefs.strip().split('|'):
fixedr = self._cleanup_id(r)
if fixedr is not None and fixedr.strip() != '':
if re.match('HPRD', fixedr):
# proteins are not == genes.
gu.addTriple(g, gene_id, self.properties['has_gene_product'], fixedr)
else:
# skip some of these for now
if fixedr.split(':')[0] not in ['Vega', 'IMGT/GENE-DB']:
gu.addEquivalentClass(g, gene_id, fixedr)
# edge cases of id | symbol | chr | map_loc:
# 263 AMD1P2 X|Y with Xq28 and Yq12
# 438 ASMT X|Y with Xp22.3 or Yp11.3 # in PAR
# 419 ART3 4 with 4q21.1|4p15.1-p14 # no idea why there's two bands listed - possibly 2 assemblies
# 28227 PPP2R3B X|Y Xp22.33; Yp11.3 # in PAR
# 619538 OMS 10|19|3 10q26.3;19q13.42-q13.43;3p25.3 #this is of "unknown" type == susceptibility
# 101928066 LOC101928066 1|Un - # unlocated scaffold
#.........这里部分代码省略.........
示例5: _get_chrbands
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addGenome [as 别名]
def _get_chrbands(self, limit, taxon):
"""
For the given taxon, it will fetch the chr band file.
We will not deal with the coordinate information with this parser.
Here, we only are concerned with building the partonomy.
:param limit:
:return:
"""
line_counter = 0
myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
logger.info("Processing Chr bands from FILE: %s", myfile)
geno = Genotype(self.graph)
# build the organism's genome from the taxon
genome_label = self.files[taxon]['genome_label']
taxon_id = 'NCBITaxon:'+taxon
# add the taxon as a class. adding the class label elsewhere
self.gu.addClassToGraph(self.graph, taxon_id, None)
self.gu.addSynonym(self.graph, taxon_id, genome_label)
self.gu.loadObjectProperties(self.graph, Feature.object_properties)
genome_id = geno.makeGenomeID(taxon_id)
geno.addGenome(taxon_id, genome_label)
self.gu.addOWLPropertyClassRestriction(
self.graph, genome_id, Genotype.object_properties['in_taxon'],
taxon_id)
with gzip.open(myfile, 'rb') as f:
for line in f:
# skip comments
line = line.decode().strip()
if re.match(r'^#', line):
continue
# chr13 4500000 10000000 p12 stalk
(chrom, start, stop, band, rtype) = line.split('\t')
line_counter += 1
# NOTE
# some less-finished genomes have placed and unplaced scaffolds
# * Placed scaffolds:
# Scaffold has an oriented location within a chromosome.
# * Unlocalized scaffolds:
# scaffold 's chromosome is known,
# scaffold's position, orientation or both is not known.
# *Unplaced scaffolds:
# it is not known which chromosome the scaffold belongs to.
# find out if the thing is a full on chromosome, or a scaffold:
# ex: unlocalized scaffold: chr10_KL568008v1_random
# ex: unplaced scaffold: chrUn_AABR07022428v1
placed_scaffold_pattern = r'chr(\d+|X|Y|Z|W|MT|M)'
# TODO unused
# unlocalized_scaffold_pattern = \
# placed_scaffold_pattern + r'_(\w+)_random'
# unplaced_scaffold_pattern = r'chrUn_(\w+)'
m = re.match(placed_scaffold_pattern+r'$', chrom)
if m is not None and len(m.groups()) == 1:
# the chromosome is the first match of the pattern
# ch = m.group(1) # TODO unused
pass
else:
# let's skip over anything that isn't a placed_scaffold
# at the class level
logger.info("Skipping non-placed chromosome %s", chrom)
continue
# the chrom class, taxon as the reference
cclassid = makeChromID(chrom, taxon, 'CHR')
# add the chromosome as a class
geno.addChromosomeClass(chrom, taxon_id, genome_label)
self.gu.addOWLPropertyClassRestriction(
self.graph, cclassid,
self.gu.object_properties['member_of'], genome_id)
# add the band(region) as a class
maplocclass_id = cclassid+band
maplocclass_label = makeChromLabel(chrom+band, genome_label)
if band is not None and band.strip() != '':
region_type_id = self.map_type_of_region(rtype)
self.gu.addClassToGraph(
self.graph, maplocclass_id, maplocclass_label,
region_type_id)
else:
region_type_id = Feature.types['chromosome']
# add the staining intensity of the band
if re.match(r'g(neg|pos|var)', rtype):
if region_type_id in [
Feature.types['chromosome_band'],
Feature.types['chromosome_subband']]:
stain_type = Feature.types.get(rtype)
if stain_type is not None:
self.gu.addOWLPropertyClassRestriction(
self.graph, maplocclass_id,
Feature.properties['has_staining_intensity'],
#.........这里部分代码省略.........
示例6: _get_chrbands
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addGenome [as 别名]
def _get_chrbands(self, limit, taxon):
"""
:param limit:
:return:
"""
model = Model(self.graph)
# TODO PYLINT figure out what limit was for and why it is unused
line_counter = 0
myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
logger.info("Processing Chr bands from FILE: %s", myfile)
geno = Genotype(self.graph)
monochrom = Monochrom(self.graph_type, self.are_bnodes_skized)
# used to hold band definitions for a chr
# in order to compute extent of encompasing bands
mybands = {}
# build the organism's genome from the taxon
genome_label = self.files[taxon]['genome_label']
taxon_id = 'NCBITaxon:'+taxon
# add the taxon as a class. adding the class label elsewhere
model.addClassToGraph(taxon_id, None)
model.addSynonym(taxon_id, genome_label)
geno.addGenome(taxon_id, genome_label)
# add the build and the taxon it's in
build_num = self.files[taxon]['build_num']
build_id = 'UCSC:'+build_num
geno.addReferenceGenome(build_id, build_num, taxon_id)
# process the bands
with gzip.open(myfile, 'rb') as f:
for line in f:
# skip comments
line = line.decode().strip()
if re.match('^#', line):
continue
# chr13 4500000 10000000 p12 stalk
(scaffold, start, stop, band_num, rtype) = line.split('\t')
line_counter += 1
# NOTE some less-finished genomes have
# placed and unplaced scaffolds
# * Placed scaffolds:
# the scaffolds have been placed within a chromosome.
# * Unlocalized scaffolds:
# although the chromosome within which the scaffold occurs
# is known, the scaffold's position or orientation
# is not known.
# * Unplaced scaffolds:
# it is not known which chromosome the scaffold belongs to
#
# find out if the thing is a full on chromosome, or a scaffold:
# ex: unlocalized scaffold: chr10_KL568008v1_random
# ex: unplaced scaffold: chrUn_AABR07022428v1
placed_scaffold_pattern = r'(chr(?:\d+|X|Y|Z|W|M))'
unlocalized_scaffold_pattern = \
placed_scaffold_pattern+r'_(\w+)_random'
unplaced_scaffold_pattern = r'chr(Un(?:_\w+)?)'
m = re.match(placed_scaffold_pattern+r'$', scaffold)
if m is not None and len(m.groups()) == 1:
# the chromosome is the first match of the pattern
chrom_num = m.group(1)
else:
# skip over anything that isn't a placed_scaffold
# at the class level
logger.info("Found non-placed chromosome %s", scaffold)
chrom_num = None
m_chr_unloc = re.match(unlocalized_scaffold_pattern, scaffold)
m_chr_unplaced = re.match(unplaced_scaffold_pattern, scaffold)
scaffold_num = None
if m:
pass
elif m_chr_unloc is not None and\
len(m_chr_unloc.groups()) == 2:
chrom_num = m_chr_unloc.group(1)
scaffold_num = chrom_num+'_'+m_chr_unloc.group(2)
elif m_chr_unplaced is not None and\
len(m_chr_unplaced.groups()) == 1:
scaffold_num = m_chr_unplaced.group(1)
else:
logger.error(
"There's a chr pattern that we aren't matching: %s",
scaffold)
if chrom_num is not None:
# the chrom class (generic) id
chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')
# first, add the chromosome class (in the taxon)
geno.addChromosomeClass(
chrom_num, taxon_id, self.files[taxon]['genome_label'])
#.........这里部分代码省略.........
示例7: _get_gene_info
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addGenome [as 别名]
def _get_gene_info(self, limit):
"""
Currently loops through the gene_info file and
creates the genes as classes, typed with SO. It will add their label,
any alternate labels as synonyms, alternate ids as equivlaent classes.
HPRDs get added as protein products.
The chromosome and chr band get added as blank node regions,
and the gene is faldo:located
on the chr band.
:param limit:
:return:
"""
if self.testMode:
g = self.testgraph
else:
g = self.graph
geno = Genotype(g)
model = Model(g)
# not unzipping the file
logger.info("Processing 'Gene Info' records")
line_counter = 0
gene_info = '/'.join((self.rawdir, self.files['gene_info']['file']))
logger.info("FILE: %s", gene_info)
# Add taxa and genome classes for those in our filter
for tax_num in self.tax_ids:
tax_id = ':'.join(('NCBITaxon', str(tax_num)))
# tax label can get added elsewhere
geno.addGenome(tax_id, str(tax_num))
# label added elsewhere
model.addClassToGraph(tax_id, None)
with gzip.open(gene_info, 'rb') as f:
row = f.readline().decode().strip().split('\t')
logger.info("Header has %i columns", len(row))
for line in f:
# skip comments
line = line.decode().strip()
if re.match(r'^#', line):
continue
(tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
map_loc, desc, gtype, authority_symbol, name,
nomenclature_status, other_designations,
modification_date, feature_type) = line.split('\t')
# ##set filter=None in init if you don't want to have a filter
# if self.filter is not None:
# if ((self.filter == 'taxids' and \
# (int(tax_num) not in self.tax_ids))
# or (self.filter == 'geneids' and \
# (int(gene_num) not in self.gene_ids))):
# continue
# #### end filter
if self.testMode and int(gene_num) not in self.gene_ids:
continue
if not self.testMode and int(tax_num) not in self.tax_ids:
continue
line_counter += 1
gene_id = ':'.join(('NCBIGene', gene_num))
tax_id = ':'.join(('NCBITaxon', tax_num))
gene_type_id = self.map_type_of_gene(gtype.strip())
if symbol == 'NEWENTRY':
label = None
else:
label = symbol
# sequence feature, not a gene
if gene_type_id == 'SO:0000110':
self.class_or_indiv[gene_id] = 'I'
else:
self.class_or_indiv[gene_id] = 'C'
if not self.testMode and \
limit is not None and line_counter > limit:
continue
if self.class_or_indiv[gene_id] == 'C':
model.addClassToGraph(gene_id, label, gene_type_id, desc)
# NCBI will be the default leader,
# so we will not add the leader designation here.
else:
model.addIndividualToGraph(
gene_id, label, gene_type_id, desc)
# in this case, they aren't genes.
# so we want someone else to be the leader.
if name != '-':
model.addSynonym(gene_id, name)
if synonyms.strip() != '-':
for s in synonyms.split('|'):
model.addSynonym(
gene_id, s.strip(),
Assoc.annotation_properties['hasRelatedSynonym'])
if other_designations.strip() != '-':
for s in other_designations.split('|'):
#.........这里部分代码省略.........
示例8: _process_all
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addGenome [as 别名]
def _process_all(self, limit):
"""
This takes the list of omim identifiers from the omim.txt.Z file,
and iteratively queries the omim api for the json-formatted data.
This will create OMIM classes, with the label, definition, and some synonyms.
If an entry is "removed", it is added as a deprecated class.
If an entry is "moved", it is deprecated and consider annotations are added.
Additionally, we extract:
*phenotypicSeries ids as superclasses
*equivalent ids for Orphanet and UMLS
If set to testMode, it will write only those items in the test_ids to the testgraph.
:param limit:
:return:
"""
omimids = self._get_omim_ids() # store the set of omim identifiers
omimparams = {
'format': 'json',
'include': 'all',
}
# you will need to add the API key into the conf.json file, like:
# keys : { 'omim' : '<your api key here>' }
omimparams.update({'apiKey': config.get_config()['keys']['omim']})
# http://api.omim.org/api/entry?mimNumber=100100&include=all
if self.testMode:
g = self.testgraph
else:
g = self.graph
gu = GraphUtils(curie_map.get())
it = 0 # for counting
# note that you can only do request batches of 20
# see info about "Limits" at http://omim.org/help/api
groupsize = 20
if not self.testMode and limit is not None:
# just in case the limit is larger than the number of records, max it out
max = min((limit, omimids.__len__()))
else:
max = omimids.__len__()
# max = 10 #for testing
# TODO write the json to local files - make the assumption that downloads within 24 hrs are the same
# now, loop through the omim numbers and pull the records as json docs
while it < max:
end = min((max, it+groupsize))
# iterate through the omim ids list, and fetch from the OMIM api in batches of 20
if self.testMode:
intersect = list(set([str(i) for i in self.test_ids]) & set(omimids[it:end]))
if len(intersect) > 0: # some of the test ids are in the omimids
logger.info("found test ids: %s", intersect)
omimparams.update({'mimNumber': ','.join(intersect)})
else:
it += groupsize
continue
else:
omimparams.update({'mimNumber': ','.join(omimids[it:end])})
p = urllib.parse.urlencode(omimparams)
url = '/'.join((self.OMIM_API, 'entry'))+'?%s' % p
logger.info('fetching: %s', '/'.join((self.OMIM_API, 'entry'))+'?%s' % p)
# ### if you want to test a specific entry number, uncomment the following code block
# if ('101600' in omimids[it:end]): #104000
# print("FOUND IT in",omimids[it:end])
# else:
# #testing very specific record
# it+=groupsize
# continue
# ### end code block for testing
# print ('fetching:',(',').join(omimids[it:end]))
# print('url:',url)
d = urllib.request.urlopen(url)
resp = d.read().decode()
request_time = datetime.now()
it += groupsize
myjson = json.loads(resp)
entries = myjson['omim']['entryList']
geno = Genotype(g)
# add genome and taxon
tax_num = '9606'
tax_id = 'NCBITaxon:9606'
tax_label = 'Human'
geno.addGenome(tax_id, str(tax_num)) # tax label can get added elsewhere
gu.addClassToGraph(g, tax_id, None) # label added elsewhere
for e in entries:
#.........这里部分代码省略.........
示例9: _get_chrbands
# 需要导入模块: from dipper.models.Genotype import Genotype [as 别名]
# 或者: from dipper.models.Genotype.Genotype import addGenome [as 别名]
def _get_chrbands(self, limit, taxon):
"""
For the given taxon, it will fetch the chr band file.
We will not deal with the coordinate information with this parser.
Here, we only are concerned with building the partonomy.
:param limit:
:return:
"""
model = Model(self.graph)
line_counter = 0
myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
LOG.info("Processing Chr bands from FILE: %s", myfile)
geno = Genotype(self.graph)
# build the organism's genome from the taxon
genome_label = self.files[taxon]['genome_label']
taxon_id = 'NCBITaxon:' + taxon
# add the taxon as a class. adding the class label elsewhere
model.addClassToGraph(taxon_id, None)
model.addSynonym(taxon_id, genome_label)
genome_id = geno.makeGenomeID(taxon_id)
geno.addGenome(taxon_id, genome_label)
model.addOWLPropertyClassRestriction(
genome_id, self.globaltt['in taxon'], taxon_id)
placed_scaffold_pattern = r'chr(\d+|X|Y|Z|W|MT|M)'
# currently unused patterns
# unlocalized_scaffold_pattern = placed_scaffold_pattern + r'_(\w+)_random'
# unplaced_scaffold_pattern = r'chrUn_(\w+)'
col = ['chrom', 'start', 'stop', 'band', 'rtype']
with gzip.open(myfile, 'rb') as reader:
for line in reader:
line_counter += 1
# skip comments
line = line.decode().strip()
if line[0] == '#':
continue
# chr13 4500000 10000000 p12 stalk
row = line.split('\t')
chrom = row[col.index('chrom')]
band = row[col.index('band')]
rtype = row[col.index('rtype')]
# NOTE
# some less-finished genomes have placed and unplaced scaffolds
# * Placed scaffolds:
# Scaffold has an oriented location within a chromosome.
# * Unlocalized scaffolds:
# scaffold 's chromosome is known,
# scaffold's position, orientation or both is not known.
# *Unplaced scaffolds:
# it is not known which chromosome the scaffold belongs to.
# find out if the thing is a full on chromosome, or a scaffold:
# ex: unlocalized scaffold: chr10_KL568008v1_random
# ex: unplaced scaffold: chrUn_AABR07022428v1
mch = re.match(placed_scaffold_pattern+r'$', chrom)
if mch is not None and len(mch.groups()) == 1:
# the chromosome is the first match of the pattern
# chrom = m.group(1) # TODO unused
pass
else:
# let's skip over anything that isn't a placed_scaffold
LOG.info("Skipping non-placed chromosome %s", chrom)
continue
# the chrom class, taxon as the reference
cclassid = makeChromID(chrom, taxon, 'CHR')
# add the chromosome as a class
geno.addChromosomeClass(chrom, taxon_id, genome_label)
model.addOWLPropertyClassRestriction(
cclassid, self.globaltt['member of'], genome_id)
# add the band(region) as a class
maplocclass_id = cclassid+band
maplocclass_label = makeChromLabel(chrom+band, genome_label)
if band is not None and band.strip() != '':
region_type_id = self.map_type_of_region(rtype)
model.addClassToGraph(
maplocclass_id, maplocclass_label,
region_type_id)
else:
region_type_id = self.globaltt['chromosome']
# add the staining intensity of the band
if re.match(r'g(neg|pos|var)', rtype):
if region_type_id in [
self.globaltt['chromosome_band'],
self.globaltt['chromosome_subband']]:
stain_type = self.resolve(rtype)
if stain_type is not None:
model.addOWLPropertyClassRestriction(
maplocclass_id,
self.globaltt['has_sequence_attribute'],
self.resolve(rtype))
else:
# usually happens if it's a chromosome because
# they don't actually have banding info
#.........这里部分代码省略.........