本文整理汇总了Python中CGAT.Database类的典型用法代码示例。如果您正苦于以下问题:Python Database类的具体用法?Python Database怎么用?Python Database使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Database类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: loadHypergeometricAnalysis
def loadHypergeometricAnalysis(infile, outfile):
'''load GO results.'''
track = P.toTable(outfile)
tablename = 'hypergeometric_%s_summary' % track
P.load(infile, outfile, tablename=tablename)
dbh = connect()
ontologies = [x[0] for x in Database.executewait(
dbh,
'''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]
genelists = [x[0] for x in Database.executewait(
dbh,
'''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall()]
# output files from runGO.py
sections = ('results', 'parameters', 'withgenes')
for section in sections:
tablename = 'hypergeometric_%s_%s' % (track, section)
statement = '''
python %(scriptsdir)s/combine_tables.py
--cat=track
--regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
| python %(scriptsdir)s/csv2db.py
%(csv2db_options)s
--table=%(tablename)s
>> %(outfile)s'''
P.run()
for ontology in ontologies:
fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology)
if not os.path.exists(fn):
E.warn("file %s does not exist" % fn)
continue
P.load(fn,
outfile,
tablename='hypergeometric_%s_%s_l2fold' % (track, ontology),
options='--allow-empty')
fn = os.path.join(
infile + ".dir", "all_alldesc.%s.l10pvalue" % ontology)
P.load(fn,
outfile,
tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology),
options='--allow-empty')
fn = os.path.join(
infile + ".dir", "all_alldesc.%s.l10qvalue" % ontology)
P.load(fn,
outfile,
tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology),
options='--allow-empty')
示例2: createViewMapping
def createViewMapping(infile, outfile):
"""create view in database for alignment stats.
This view aggregates all information on a per-track basis.
The table is built from the following tracks:
mapping_stats
bam_stats
"""
tablename = P.toTable(outfile)
# can not create views across multiple database, so use table
view_type = "TABLE"
dbhandle = connect()
Database.executewait(dbhandle, "DROP %(view_type)s IF EXISTS %(tablename)s" % locals())
statement = """
CREATE %(view_type)s %(tablename)s AS
SELECT *
FROM bam_stats AS b
"""
Database.executewait(dbhandle, statement % locals())
示例3: ReadGene2GOFromDatabase
def ReadGene2GOFromDatabase(dbhandle, go_type, database, species):
"""read go assignments from ensembl database.
returns a dictionary of lists.
(one to many mapping of genes to GO categories)
and a dictionary of go-term to go information
Note: assumes that external_db_id for GO is 1000
"""
statement = GetGOStatement(go_type, database, species)
result = Database.executewait(dbhandle, statement,
retries=0).fetchall()
gene2go = {}
go2info = collections.defaultdict(GOInfo)
for gene_id, goid, description, evidence in result:
gm = GOMatch(goid, go_type, description, evidence)
gi = GOInfo(goid, go_type, description)
if gene_id not in gene2go:
gene2go[gene_id] = []
gene2go[gene_id].append(gm)
go2info[goid] = gi
return gene2go, go2info
示例4: importCodingPotential
def importCodingPotential( infile, outfile ):
'''import annotations'''
table = outfile[:-len(".import")]
statement = '''
python %(scriptsdir)s/csv2db.py %(csv2db_options)s
--allow-empty
--index=gene_id
--map=gene_id:str
--table=%(table)s
< %(infile)s
> %(outfile)s'''
P.run()
# set the is_coding flag
dbhandle = sqlite3.connect( PARAMS["database"] )
Database.executewait( dbhandle, '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % locals())
Database.executewait( dbhandle, '''UPDATE %(table)s SET is_coding = (f_iscoding == 'coding') OR (r_iscoding == 'coding') ''' % locals())
dbhandle.commit()
示例5: mergeEffectsPerGene
def mergeEffectsPerGene( infile, outfile ):
'''summarize effects on a per-gene level.'''
tablename = outfile[:-len(".load")]
dbhandle = connect()
statement = '''
CREATE TABLE %(tablename)s AS
SELECT DISTINCT
track,
gene_id,
COUNT(*) AS ntranscripts,
MIN(e.nalleles) AS min_nalleles,
MAX(e.nalleles) AS max_nalleles,
MIN(e.stop_min) AS min_stop_min,
MAX(e.stop_min) AS max_stop_min,
MIN(e.stop_max) AS min_stop_max,
MAX(e.stop_max) AS max_stop_max,
SUM( CASE WHEN stop_min > 0 AND cds_len - stop_min * 3 < last_exon_start THEN 1
ELSE 0 END) AS nmd_knockout,
SUM( CASE WHEN stop_max > 0 AND cds_len - stop_max * 3 < last_exon_start THEN 1
ELSE 0 END) AS nmd_affected
FROM annotations.transcript_info as i, effects AS e
WHERE i.transcript_id = e.transcript_id
GROUP BY i.gene_id, track
''' % locals()
Database.executewait( dbhandle, "DROP TABLE IF EXISTS %(tablename)s" % locals() )
Database.executewait( dbhandle, statement )
Database.executewait( dbhandle, "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)" % locals())
dbhandle.commit()
P.touch(outfile)
示例6: loadCodingPotential
def loadCodingPotential( infile, outfile ):
'''load annotations'''
table = P.toTable( outfile )
statement = '''
gunzip < %(infile)s
| python %(scriptsdir)s/csv2db.py
%(csv2db_options)s
--allow-empty
--index=gene_id
--map=gene_id:str
--table=%(table)s
> %(outfile)s'''
P.run()
# set the is_coding flag
dbhandle = sqlite3.connect( PARAMS["database"] )
Database.executewait( dbhandle, '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % locals())
Database.executewait( dbhandle, '''UPDATE %(table)s SET is_coding = (result == 'coding')''' % locals())
dbhandle.commit()
示例7: DumpGOFromDatabase
def DumpGOFromDatabase(outfile,
dbhandle,
options):
"""read go assignments from database.
and dump them into a flatfile.
(one to many mapping of genes to GO categories)
and a dictionary of go-term to go information
"""
E.info("category\ttotal\tgenes\tcategories")
all_genes = collections.defaultdict(int)
all_categories = collections.defaultdict(int)
all_ntotal = 0
outfile.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")
for go_type in options.ontology:
genes = collections.defaultdict(int)
categories = collections.defaultdict(int)
ntotal = 0
statement = GetGOStatement(go_type, options.database_name,
options.species)
results = Database.executewait(
dbhandle, statement, retries=0).fetchall()
for result in results:
outfile.write("\t".join(map(str, (go_type,) + result)) + "\n")
gene_id, goid, description, evidence = result
genes[gene_id] += 1
categories[goid] += 1
ntotal += 1
all_genes[gene_id] += 1
all_categories[goid] += 1
all_ntotal += 1
E.info("%s\t%i\t%i\t%i" % (go_type, ntotal,
len(genes),
len(categories)))
E.info("%s\t%i\t%i\t%i" % ("all",
all_ntotal,
len(all_genes),
len(all_categories)))
return
示例8: loadCuffdiff
def loadCuffdiff(dbhandle, infile, outfile, min_fpkm=1.0):
'''load results from cuffdiff analysis to database
This functions parses and loads the results of a cuffdiff differential
expression analysis.
Parsing is performed by the parseCuffdiff function.
Multiple tables will be created as cuffdiff outputs information
on gene, isoform, tss, etc. levels.
The method converts from ln(fold change) to log2 fold change.
Pairwise comparisons in which one gene is not expressed (fpkm <
`min_fpkm`) are set to status 'NOCALL'. These transcripts might
nevertheless be significant.
Arguments
---------
dbhandle : object
Database handle.
infile : string
Input filename, output from cuffdiff
outfile : string
Output filename in :term:`tsv` format.
min_fpkm : float
Minimum fpkm. Genes with an fpkm lower than this will
be set to status `NOCALL`.
'''
prefix = P.toTable(outfile)
indir = infile + ".dir"
if not os.path.exists(indir):
P.touch(outfile)
return
# E.info( "building cummeRbund database" )
# R('''library(cummeRbund)''')
# cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' )
# to be continued...
tmpname = P.getTempFilename(shared=True)
# ignore promoters and splicing - no fold change column, but sqrt(JS)
for fn, level in (("cds_exp.diff.gz", "cds"),
("gene_exp.diff.gz", "gene"),
("isoform_exp.diff.gz", "isoform"),
# ("promoters.diff.gz", "promotor"),
# ("splicing.diff.gz", "splice"),
("tss_group_exp.diff.gz", "tss")):
tablename = prefix + "_" + level + "_diff"
infile = os.path.join(indir, fn)
results = parseCuffdiff(infile, min_fpkm=min_fpkm)
Expression.writeExpressionResults(tmpname, results)
P.load(tmpname, outfile,
tablename=tablename,
options="--allow-empty-file "
"--add-index=treatment_name "
"--add-index=control_name "
"--add-index=test_id")
for fn, level in (("cds.fpkm_tracking.gz", "cds"),
("genes.fpkm_tracking.gz", "gene"),
("isoforms.fpkm_tracking.gz", "isoform"),
("tss_groups.fpkm_tracking.gz", "tss")):
tablename = prefix + "_" + level + "_levels"
infile = os.path.join(indir, fn)
P.load(infile, outfile,
tablename=tablename,
options="--allow-empty-file "
"--add-index=tracking_id "
"--add-index=control_name "
"--add-index=test_id")
# Jethro - load tables of sample specific cuffdiff fpkm values into csvdb
# IMS: First read in lookup table for CuffDiff/Pipeline sample name
# conversion
inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz"))
inf.readline()
sample_lookup = {}
for line in inf:
line = line.split("\t")
our_sample_name = IOTools.snip(line[0])
our_sample_name = re.sub("-", "_", our_sample_name)
cuffdiff_sample_name = "%s_%s" % (line[1], line[2])
sample_lookup[cuffdiff_sample_name] = our_sample_name
inf.close()
for fn, level in (("cds.read_group_tracking.gz", "cds"),
("genes.read_group_tracking.gz", "gene"),
("isoforms.read_group_tracking.gz", "isoform"),
("tss_groups.read_group_tracking.gz", "tss")):
#.........这里部分代码省略.........
示例9: buildExpressionStats
def buildExpressionStats(
dbhandle,
outfile,
tablenames,
outdir,
regex_table="(?P<design>[^_]+)_"
"(?P<geneset>[^_]+)_"
"(?P<counting_method>[^_]+)_"
"(?P<method>[^_]+)_"
"(?P<level>[^_]+)_diff"):
"""compile expression summary statistics from database.
This method outputs a table with the number of genes tested,
failed, differentially expressed, etc. for a series of DE tests.
Arguments
---------
dbhandle : object
Database handle.
tables : list
List of tables to process.
outfile : string
Output filename in :term:`tsv` format.
outdir : string
Output directory for diagnostic plots.
regex : string
Regular expression to extract experimental information
from table name.
"""
keys_status = "OK", "NOTEST", "FAIL", "NOCALL"
outf = IOTools.openFile(outfile, "w")
outf.write("\t".join(
("design",
"geneset",
"level",
"counting_method",
"treatment_name",
"control_name",
"tested",
"\t".join(["status_%s" % x for x in keys_status]),
"significant",
"twofold")) + "\n")
for tablename in tablenames:
r = re.search(regex_table, tablename)
if r is None:
raise ValueError(
"can't match tablename '%s' to regex" % tablename)
geneset = r.group("geneset")
design = r.group("design")
level = r.group("level")
counting_method = r.group("counting_method")
geneset = r.group("geneset")
def toDict(vals, l=2):
return collections.defaultdict(
int,
[(tuple(x[:l]), x[l]) for x in vals])
tested = toDict(Database.executewait(
dbhandle,
"SELECT treatment_name, control_name, "
"COUNT(*) FROM %(tablename)s "
"GROUP BY treatment_name,control_name" % locals()
).fetchall())
status = toDict(Database.executewait(
dbhandle,
"SELECT treatment_name, control_name, status, "
"COUNT(*) FROM %(tablename)s "
"GROUP BY treatment_name,control_name,status"
% locals()).fetchall(), 3)
signif = toDict(Database.executewait(
dbhandle,
"SELECT treatment_name, control_name, "
"COUNT(*) FROM %(tablename)s "
"WHERE significant "
"GROUP BY treatment_name,control_name" % locals()
).fetchall())
fold2 = toDict(Database.executewait(
dbhandle,
"SELECT treatment_name, control_name, "
"COUNT(*) FROM %(tablename)s "
"WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
"GROUP BY treatment_name,control_name,significant"
% locals()).fetchall())
for treatment_name, control_name in tested.keys():
outf.write("\t".join(map(str, (
design,
geneset,
level,
counting_method,
treatment_name,
control_name,
tested[(treatment_name, control_name)],
"\t".join(
[str(status[(treatment_name, control_name, x)])
#.........这里部分代码省略.........
示例10: loadCuffdiff
#.........这里部分代码省略.........
inf.close()
for fn, level in (("cds.read_group_tracking.gz", "cds"),
("genes.read_group_tracking.gz", "gene"),
("isoforms.read_group_tracking.gz", "isoform"),
("tss_groups.read_group_tracking.gz", "tss")):
tablename = prefix + "_" + level + "sample_fpkms"
tmpf = P.getTempFilename(".")
inf = IOTools.openFile(os.path.join(indir, fn)).readlines()
outf = IOTools.openFile(tmpf, "w")
samples = []
genes = {}
x = 0
for line in inf:
if x == 0:
x += 1
continue
line = line.split()
gene_id = line[0]
condition = line[1]
replicate = line[2]
fpkm = line[6]
status = line[8]
sample_id = condition + "_" + replicate
if sample_id not in samples:
samples.append(sample_id)
# IMS: The following block keeps getting its indenting messed
# up. It is not part of the 'if sample_id not in samples' block
# plesae make sure it does not get made part of it
if gene_id not in genes:
genes[gene_id] = {}
genes[gene_id][sample_id] = fpkm
else:
if sample_id in genes[gene_id]:
raise ValueError(
'sample_id %s appears twice in file for gene_id %s'
% (sample_id, gene_id))
else:
if status != "OK":
genes[gene_id][sample_id] = status
else:
genes[gene_id][sample_id] = fpkm
samples = sorted(samples)
# IMS - CDS files might be empty if not cds has been
# calculated for the genes in the long term need to add CDS
# annotation to denovo predicted genesets in meantime just
# skip if cds tracking file is empty
if len(samples) == 0:
continue
headers = "gene_id\t" + "\t".join([sample_lookup[x] for x in samples])
outf.write(headers + "\n")
for gene in genes.iterkeys():
outf.write(gene + "\t")
x = 0
while x < len(samples) - 1:
outf.write(genes[gene][samples[x]] + "\t")
x += 1
# IMS: Please be careful with this line. It keeps getting moved
# into the above while block where it does not belong
outf.write(genes[gene][samples[len(samples) - 1]] + "\n")
outf.close()
statement = ("cat %(tmpf)s |"
" python %(scriptsdir)s/csv2db.py "
" %(csv2db_options)s"
" --allow-empty-file"
" --add-index=gene_id"
" --table=%(tablename)s"
" >> %(outfile)s.log")
P.run()
os.unlink(tmpf)
# build convenience table with tracks
tablename = prefix + "_isoform_levels"
tracks = Database.getColumnNames(dbhandle, tablename)
tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")]
tmpfile = P.getTempFile(dir=".")
tmpfile.write("track\n")
tmpfile.write("\n".join(tracks) + "\n")
tmpfile.close()
P.load(tmpfile.name, outfile)
os.unlink(tmpfile.name)
示例11: buildExpressionStats
def buildExpressionStats(tables, method, outfile, outdir):
'''build expression summary statistics.
Creates also diagnostic plots in
<exportdir>/<method> directory.
'''
dbhandle = sqlite3.connect(PARAMS["database"])
def _split(tablename):
# this would be much easier, if feature_counts/gene_counts/etc.
# would not contain an underscore.
try:
design, geneset, counting_method = re.match(
"([^_]+)_vs_([^_]+)_(.*)_%s" % method,
tablename).groups()
except AttributeError:
try:
design, geneset = re.match(
"([^_]+)_([^_]+)_%s" % method,
tablename).groups()
counting_method = "na"
except AttributeError:
raise ValueError("can't parse tablename %s" % tablename)
return design, geneset, counting_method
# return re.match("([^_]+)_", tablename ).groups()[0]
keys_status = "OK", "NOTEST", "FAIL", "NOCALL"
outf = IOTools.openFile(outfile, "w")
outf.write("\t".join(
("design",
"geneset",
"level",
"treatment_name",
"counting_method",
"control_name",
"tested",
"\t".join(["status_%s" % x for x in keys_status]),
"significant",
"twofold")) + "\n")
all_tables = set(Database.getTables(dbhandle))
for level in CUFFDIFF_LEVELS:
for tablename in tables:
tablename_diff = "%s_%s_diff" % (tablename, level)
tablename_levels = "%s_%s_diff" % (tablename, level)
design, geneset, counting_method = _split(tablename_diff)
if tablename_diff not in all_tables:
continue
def toDict(vals, l=2):
return collections.defaultdict(
int,
[(tuple(x[:l]), x[l]) for x in vals])
tested = toDict(
Database.executewait(
dbhandle,
"SELECT treatment_name, control_name, "
"COUNT(*) FROM %(tablename_diff)s "
"GROUP BY treatment_name,control_name" % locals()
).fetchall())
status = toDict(Database.executewait(
dbhandle,
"SELECT treatment_name, control_name, status, "
"COUNT(*) FROM %(tablename_diff)s "
"GROUP BY treatment_name,control_name,status"
% locals()).fetchall(), 3)
signif = toDict(Database.executewait(
dbhandle,
"SELECT treatment_name, control_name, "
"COUNT(*) FROM %(tablename_diff)s "
"WHERE significant "
"GROUP BY treatment_name,control_name" % locals()
).fetchall())
fold2 = toDict(Database.executewait(
dbhandle,
"SELECT treatment_name, control_name, "
"COUNT(*) FROM %(tablename_diff)s "
"WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
"GROUP BY treatment_name,control_name,significant"
% locals()).fetchall())
for treatment_name, control_name in tested.keys():
outf.write("\t".join(map(str, (
design,
geneset,
level,
counting_method,
treatment_name,
control_name,
tested[(treatment_name, control_name)],
"\t".join(
#.........这里部分代码省略.........
示例12: importLincRNA
def importLincRNA( infile, outfile ):
'''build a linc RNA set.
* no coding potential
* unknown and intergenic transcripts
* no overlap with ``linc_exclude`` (usually: human refseq)
* at least ``linc_min_length`` bp in length
* at least ``linc_min_reads`` reads in transcript
'''
table = outfile[:-len(".import")]
track = table[:-len("Linc")]
dbhandle = sqlite3.connect( PARAMS["database"] )
Database.executewait( dbhandle, '''DROP TABLE IF EXISTS %(table)s''' % locals())
Database.executewait( dbhandle, '''CREATE TABLE %(table)s (gene_id TEXT)''' % locals())
Database.executewait( dbhandle, '''CREATE INDEX %(table)s_index1 ON %(table)s (gene_id)''' % locals())
joins, wheres = [], ["1"]
if PARAMS["linc_min_reads"] > 0:
joins.append( ", %(track)s_coverage as cov" % locals() )
wheres.append( "cov.gene_id = m.gene_id2 AND cov.nmatches >= %(i)" % PARAMS["linc_min_reads"] )
if PARAMS["linc_exclude"] > 0:
joins.append( "LEFT JOIN %s_vs_%s_ovl as ovl on ovl.gene_id2 = a.gene_id" %\
(PARAMS["linc_exclude"], track ) )
wheres.append( "ovl.gene_id1 IS NULL" )
wheres = " AND ".join( wheres )
joins = " ".join( joins )
statement = '''INSERT INTO %(table)s
SELECT DISTINCT(a.gene_id) FROM
%(track)s_annotation as a
%(joins)s
LEFT JOIN %(track)s_coding AS c on c.gene_id = a.gene_id
WHERE is_unknown
AND is_intergenic
AND exons_sum >= %(linc_min_length)i
AND (c.is_coding IS NULL or not c.is_coding)
AND %(wheres)s
''' % dict( PARAMS.items() + locals().items() )
E.debug( "statement to build lincRNA: %s" % statement)
Database.executewait( dbhandle, statement % locals())
dbhandle.commit()
cc = dbhandle.cursor()
result = cc.execute("SELECT COUNT(*) FROM %(table)s" % locals() ).fetchall()[0][0]
E.info( "build lincRNA set for %s: %i entries" % ( track, result ))
outgtf = "%s.gtf.gz" % table
E.info( "creating gtf file `%s`" % outgtf )
# output gtf file
statement = '''%(cmd-sql)s %(database)s "SELECT g.* FROM %(track)s_gtf as g, %(table)s AS t
WHERE t.gene_id = g.gene_id"
| python %(scriptsdir)s/gtf2tsv.py --invert --log=%(outfile)s
| gzip
> %(outgtf)s'''
P.run()
示例13: loadSummary
def loadSummary( infile, outfile ):
'''load several rates into a single convenience table.
'''
stmt_select = []
stmt_from = []
stmt_where = ["1"]
track = infile[:-len(".gtf.gz")]
tablename = "%s_evol" % track
if os.path.exists( "%s_rates.load" % track ):
stmt_select.append( "a.distance AS ks, a.aligned AS aligned" )
stmt_from.append('''LEFT JOIN %(track)s_rates AS a
ON r.gene_id = a.gene_id AND
a.aligned >= %(rates_min_aligned)i AND
a.distance <= %(rates_max_rate)f''' )
if os.path.exists( "%s_coverage.load" % track ):
stmt_select.append("cov.nmatches AS nreads, cov.mean AS meancoverage" )
stmt_from.append("LEFT JOIN %(track)s_coverage AS cov ON r.gene_id = cov.gene_id" )
if os.path.exists( "%s_repeats_gc.load" % track ):
stmt_select.append("ar_gc.exons_mean AS repeats_gc" )
stmt_from.append("LEFT JOIN %(track)s_repeats_gc AS ar_gc ON r.gene_id = ar_gc.gene_id" )
if os.path.exists( "%s_repeats_rates.load" % track ):
stmt_select.append("ar.exons_length AS ar_aligned, ar.exons_median AS ka, a.distance/ar.exons_median AS kska" )
stmt_from.append('''LEFT JOIN %(track)s_repeats_rates AS ar
ON r.gene_id = ar.gene_id AND
ar.exons_nval >= %(rates_min_repeats)i''' )
if os.path.exists( "%s_introns_rates.load" % track ):
stmt_select.append("ir.aligned AS ir_aligned, ir.distance AS ki, a.distance/ir.distance AS kski" )
stmt_from.append('''LEFT JOIN %(track)s_introns_rates AS ir
ON r.gene_id = ir.gene_id AND
ir.aligned >= %(rates_min_aligned)i''' )
x = locals()
x.update( PARAMS )
stmt_select = ", ".join( stmt_select ) % x
stmt_from = " ".join( stmt_from ) % x
stmt_where = " AND ".join( stmt_where ) % x
dbhandle = sqlite3.connect( PARAMS["database"] )
Database.executewait( dbhandle, "DROP TABLE IF EXISTS %(tablename)s " % locals() )
statement = '''
CREATE TABLE %(tablename)s AS
SELECT
CAST(r.gene_id AS TEXT) AS gene_id,
r.exons_sum as length,
r.exons_pGC as pgc,
%(stmt_select)s
FROM
%(track)s_annotation AS r
%(stmt_from)s
WHERE %(stmt_where)s
''' % locals()
Database.executewait( dbhandle, statement)
dbhandle.commit()
P.touch(outfile)
示例14: buildDMRStats
def buildDMRStats(tables, method, outfile, dbhandle):
"""build dmr summary statistics.
This method counts the number of up/down, 2fold up/down, etc.
genes in output from (:mod:`scripts/runExpression`).
This method also creates diagnostic plots in the
<exportdir>/<method> directory.
Tables should be labeled <tileset>_<design>_<method>.
Arguments
---------
tables ; list
List of tables with DMR output
method : string
Method name
outfile : string
Output filename. Tab separated file summarizing
"""
def togeneset(tablename):
return re.match("([^_]+)_", tablename).groups()[0]
keys_status = "OK", "NOTEST", "FAIL", "NOCALL"
outf = IOTools.openFile(outfile, "w")
outf.write(
"\t".join(
(
"tileset",
"design",
"track1",
"track2",
"tested",
"\t".join(["status_%s" % x for x in keys_status]),
"significant",
"up",
"down",
"twofold",
"twofold_up",
"twofold_down",
)
)
+ "\n"
)
all_tables = set(Database.getTables(dbhandle))
outdir = os.path.join(PARAMS["exportdir"], "diff_methylation")
for tablename in tables:
prefix = P.snip(tablename, "_%s" % method)
tileset, design = prefix.split("_")
def toDict(vals, l=2):
return collections.defaultdict(int, [(tuple(x[:l]), x[l]) for x in vals])
E.info("collecting data from %s" % tablename)
tested = toDict(
Database.executewait(
dbhandle,
"""SELECT treatment_name, control_name, COUNT(*)
FROM %(tablename)s
GROUP BY treatment_name,control_name"""
% locals(),
).fetchall()
)
status = toDict(
Database.executewait(
dbhandle,
"""SELECT treatment_name, control_name, status,
COUNT(*) FROM %(tablename)s
GROUP BY treatment_name,control_name,status"""
% locals(),
).fetchall(),
3,
)
signif = toDict(
Database.executewait(
dbhandle,
"""SELECT treatment_name, control_name,
COUNT(*) FROM %(tablename)s
WHERE significant
GROUP BY treatment_name,control_name"""
% locals(),
).fetchall()
)
fold2 = toDict(
Database.executewait(
dbhandle,
"""SELECT treatment_name, control_name,
COUNT(*) FROM %(tablename)s
WHERE (l2fold >= 1 or l2fold <= -1) AND significant
GROUP BY treatment_name,control_name,significant"""
% locals(),
).fetchall()
)
#.........这里部分代码省略.........
示例15: createView
def createView(dbhandle, tables, tablename, outfile,
view_type="TABLE",
ignore_duplicates=True):
'''create a database view for a list of tables.
This method performs a join across multiple tables and stores the
result either as a view or a table in the database.
Arguments
---------
dbhandle :
A database handle.
tables : list of tuples
Tables to merge. Each tuple contains the name of a table and
the field to join with the first table. For example::
tables = (
"reads_summary", "track",
"bam_stats", "track",
"context_stats", "track",
"picard_stats_alignment_summary_metrics", "track")
tablename : string
Name of the view or table to be created.
outfile : string
Output filename for status information.
view_type : string
Type of view, either ``VIEW`` or ``TABLE``. If a view is to be
created across multiple databases, use ``TABLE``.
ignore_duplicates : bool
If set to False, duplicate column names will be added with the
tablename as prefix. The default is to ignore.
'''
Database.executewait(
dbhandle,
"DROP %(view_type)s IF EXISTS %(tablename)s" % locals())
tracks, columns = [], []
tablenames = [x[0] for x in tables]
for table, track in tables:
d = Database.executewait(
dbhandle,
"SELECT COUNT(DISTINCT %s) FROM %s" % (track, table))
tracks.append(d.fetchone()[0])
columns.append(
[x.lower() for x in Database.getColumnNames(dbhandle, table)
if x != track])
E.info("creating %s from the following tables: %s" %
(tablename, str(list(zip(tablenames, tracks)))))
if min(tracks) != max(tracks):
raise ValueError(
"number of rows not identical - will not create view")
from_statement = " , ".join(
["%s as t%i" % (y[0], x) for x, y in enumerate(tables)])
f = tables[0][1]
where_statement = " AND ".join(
["t0.%s = t%i.%s" % (f, x + 1, y[1])
for x, y in enumerate(tables[1:])])
all_columns, taken = [], set()
for x, c in enumerate(columns):
i = set(taken).intersection(set(c))
if i:
E.warn("duplicate column names: %s " % i)
if not ignore_duplicates:
table = tables[x][0]
all_columns.extend(
["t%i.%s AS %s_%s" % (x, y, table, y) for y in i])
c = [y for y in c if y not in i]
all_columns.extend(["t%i.%s" % (x, y) for y in c])
taken.update(set(c))
all_columns = ",".join(all_columns)
statement = '''
CREATE %(view_type)s %(tablename)s AS SELECT t0.track, %(all_columns)s
FROM %(from_statement)s
WHERE %(where_statement)s
''' % locals()
Database.executewait(dbhandle, statement)
nrows = Database.executewait(
dbhandle, "SELECT COUNT(*) FROM view_mapping").fetchone()[0]
if nrows == 0:
raise ValueError(
"empty view mapping, check statement = %s" %
(statement % locals()))
if nrows != min(tracks):
E.warn("view creates duplicate rows, got %i, expected %i" %
(nrows, min(tracks)))
E.info("created view_mapping with %i rows" % nrows)
touchFile(outfile)