本文整理匯總了Python中gimmemotifs.config.MotifConfig.get_index_dir方法的典型用法代碼示例。如果您正苦於以下問題:Python MotifConfig.get_index_dir方法的具體用法?Python MotifConfig.get_index_dir怎麽用?Python MotifConfig.get_index_dir使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類gimmemotifs.config.MotifConfig
的用法示例。
在下文中一共展示了MotifConfig.get_index_dir方法的9個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: check_threshold
# 需要導入模塊: from gimmemotifs.config import MotifConfig [as 別名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 別名]
def check_threshold(outdir, genome, scoring="count"):
# gimme_motifs config, to get defaults
config = MotifConfig()
threshold_file = None
if scoring == "count":
# Motif scanning threshold
threshold_file = os.path.join(outdir, "threshold.{}.txt".format(genome))
if not os.path.exists(threshold_file):
# Random sequences from genome
index_dir = os.path.join(config.get_index_dir(), genome)
bg_file = os.path.join(outdir, "background.{}.fa".format(genome))
if not os.path.exists(bg_file):
m = RandomGenomicFasta(index_dir, BG_LENGTH, BG_NUMBER)
m.writefasta(bg_file)
pwmfile = config.get_default_params().get("motif_db")
pwmfile = os.path.join(config.get_motif_dir(), pwmfile)
cmd = "gimme threshold {} {} {} > {}".format(
pwmfile,
bg_file,
FDR,
threshold_file)
sp.call(cmd, shell=True)
return threshold_file
示例2: get_genome
# 需要導入模塊: from gimmemotifs.config import MotifConfig [as 別名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 別名]
def get_genome(genomebuild, fastadir, indexdir=None):
config = MotifConfig()
if not indexdir:
indexdir = config.get_index_dir()
genome_dir = os.path.join(fastadir, genomebuild)
index_dir = os.path.join(indexdir, genomebuild)
# Check for rights to write to directory
if not os.path.exists(genome_dir):
try:
os.mkdir(genome_dir)
except OSError:
sys.stderr.write("Could not create genome dir {}\n".format(genome_dir))
sys.exit(1)
# Download annotation
gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)
download_annotation(genomebuild, gene_file)
# Download genome FASTA file
download_genome(genomebuild, genome_dir)
sys.stderr.write("Creating index\n")
g = GenomeIndex()
g = g.create_index(genome_dir, index_dir)
create_bedtools_fa(index_dir, genome_dir)
示例3: __init__
# 需要導入模塊: from gimmemotifs.config import MotifConfig [as 別名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 別名]
def __init__(self, matchfile, genome="hg19", number=None):
config = MotifConfig()
index = os.path.join(config.get_index_dir(), genome)
# Create temporary files
tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name
# Create bed-file with coordinates of random sequences
matched_gc_bedfile(tmpbed, matchfile, genome, number)
# Convert track to fasta
track2fasta(index, tmpbed, tmpfasta)
# Initialize super Fasta object
Fasta.__init__(self, tmpfasta)
# Delete the temporary files
os.remove(tmpbed)
os.remove(tmpfasta)
示例4: matched_gc_bedfile
# 需要導入模塊: from gimmemotifs.config import MotifConfig [as 別名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 別名]
def matched_gc_bedfile(bedfile, matchfile, genome, number):
N_FRACTION = 0.1
config = MotifConfig()
index = os.path.join(config.get_index_dir(), genome)
genome_size = os.path.join(index, "genome.size")
genome_fa = os.path.join(index, "genome.fa")
if not os.path.exists(genome_size) or not os.path.exists(genome_fa):
raise RuntimeError, "genome files not found, please re-index {} " \
"with a recent version of gimme index".format(genome)
try:
fa = Fasta(matchfile)
gc = [(seq.upper().count("C") + seq.upper().count("G")) / float(len(seq)) for seq in fa.seqs]
lengths = [len(seq) for seq in fa.seqs]
except:
try:
bed = pybedtools.BedTool(matchfile)
gc = [float(x[4]) for x in bed.nucleotide_content(fi=genome_fa)]
lengths = [x.length for x in bed]
except:
sys.stderr.write("Please provide input file in BED or FASTA format\n")
sys.exit(1)
gc_hist,bins = np.histogram(gc, range=(0,1), bins=20)
length = np.median(lengths)
if np.std(lengths) > length * 0.05:
sys.stderr.write("Sequences do not seem to be of equal length.\n")
sys.stderr.write("GC% matched sequences of the median length ({}) will be created\n".format(length))
total = sum(gc_hist)
if number:
norm = number * gc_hist / (float(sum(gc_hist))) + 0.5
inorm = norm.astype(np.int)
s = np.argsort(norm - inorm)
while sum(inorm) > number:
if inorm[np.argmin(s)] > 0:
inorm[np.argmin(s)] -= 1
s[np.argmin(s)] = len(s)
while sum(inorm) < number:
inorm[np.argmax(s)] += 1
s[np.argmax(s)] = 0
gc_hist = inorm
rnd = pybedtools.BedTool()
out = open(bedfile, "w")
#sys.stderr.write("Generating sequences\n")
#sys.stderr.write("{}\n".format(number))
r = rnd.random(l=length, n=number * 15, g=genome_size).nucleotide_content(fi=genome_fa)
#sys.stderr.write("Retrieving\n")
features = [f[:3] + [float(f[7])] for f in r if float(f[12]) <= length * N_FRACTION]
gc = [f[3] for f in features]
#sys.stderr.write("Done\n")
for bin_start, bin_end, count in zip(bins[:-1], bins[1:], gc_hist):
#sys.stderr.write("CG {}-{}\n".format(bin_start, bin_end))
if count > 0:
rcount = 0
for f in features:
if (f[3] >= bin_start and f[3] < bin_end):
out.write("{}\t{}\t{}\n".format(*f[:3]))
rcount += 1
if rcount >= count:
break
if count != rcount:
sys.stderr.write("not enough random sequences found for {} <= GC < {} ({} instead of {})\n".format(bin_start, bin_end, rcount, count))
out.close()
示例5: diff
# 需要導入模塊: from gimmemotifs.config import MotifConfig [as 別名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 別名]
def diff(args):
infiles = args.inputfiles.split(",")
bgfile = args.bgfile
outfile = args.outputfile
pwmfile = args.pwmfile
cutoff = args.cutoff
genome = args.genome
minenr = float(args.minenr)
minfreq = float(args.minfreq)
tmpdir = mkdtemp()
# Retrieve FASTA clusters from BED file
if len(infiles) == 1 and infiles[0].endswith("bed"):
if not args.genome:
sys.stderr.write("Can't convert BED file without genome!\n")
sys.exit(1)
clusters = {}
for line in open(infiles[0]):
vals = line.strip().split("\t")
clusters.setdefault(vals[4], []).append(vals[:3])
infiles = []
config = MotifConfig()
index_dir = config.get_index_dir()
for cluster,regions in clusters.items():
sys.stderr.write("Creating FASTA file for {0}\n".format(cluster))
inbed = os.path.join(tmpdir, "{0}.bed".format(cluster))
outfa = os.path.join(tmpdir, "{0}.fa".format(cluster))
with open(inbed, "w") as f:
for vals in regions:
f.write("{0}\t{1}\t{2}\n".format(*vals))
track2fasta(os.path.join(index_dir, genome), inbed, outfa)
infiles.append(outfa)
pwms = dict([(m.id, m) for m in pwmfile_to_motifs(pwmfile)])
motifs = [m for m in pwms.keys()]
names = [os.path.basename(os.path.splitext(f)[0]) for f in infiles]
# Get background frequencies
nbg = float(len(Fasta(bgfile).seqs))
bgcounts = get_counts(bgfile, pwms.values(), cutoff)
bgfreq = [(bgcounts[m] + 0.01) / nbg for m in motifs]
# Get frequences in input files
freq = {}
counts = {}
for fname in infiles:
c = get_counts(fname, pwms.values(), cutoff)
n = float(len(Fasta(fname).seqs))
freq[fname] = [(c[m] + 0.01) / n for m in motifs]
counts[fname] = [c[m] for m in motifs]
freq = np.array([freq[fname] for fname in infiles]).transpose()
counts = np.array([counts[fname] for fname in infiles]).transpose()
#for row in freq:
# print freq
diff_plot(motifs, pwms, names, freq, counts, bgfreq, bgcounts, outfile, minenr=minenr, minfreq=minfreq)
shutil.rmtree(tmpdir)
示例6: GimmeMotifs
# 需要導入模塊: from gimmemotifs.config import MotifConfig [as 別名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 別名]
#.........這裏部分代碼省略.........
if len(vals) > 3:
try:
float(vals[3])
except ValueError:
pass
#self.logger.warn("No numerical value in column 4 on line %s of file %s, ignoring..." % (i + 1, file))
def prepare_input_bed(self, inputfile, organism="hg18", width=200, fraction=0.2, abs_max=1000, use_strand=False):
""" Create all the bed- and fasta-files necessary for motif prediction and validation """
self.inputfile = inputfile
width = int(width)
fraction = float(fraction)
abs_max = int(abs_max)
use_strand = bool(use_strand)
self.logger.info("preparing input (BED)")
# Set all peaks to specific width
self.logger.debug("Creating inputfile %s, width %s", self.input_bed, width)
# if not self.weird:
write_equalwidth_bedfile(inputfile, width, self.input_bed)
# Split input_bed in prediction and validation set
self.logger.debug(
"Splitting %s into prediction set (%s) and validation set (%s)",
self.input_bed, self.prediction_bed, self.validation_bed)
#if not self.weird:
self.prediction_num, self.validation_num = divide_file(self.input_bed, self.prediction_bed, self.validation_bed, fraction, abs_max)
# Make fasta files
index_dir = os.path.join(self.config.get_index_dir(), organism)
self.logger.debug("Creating %s", self.prediction_fa)
genome_index.track2fasta(index_dir, self.prediction_bed, self.prediction_fa, use_strand=use_strand, ignore_missing=True)
self.logger.debug("Creating %s", self.validation_fa)
genome_index.track2fasta(index_dir, self.validation_bed, self.validation_fa, use_strand=use_strand, ignore_missing=True)
def prepare_input_fa(self, inputfile, width=200, fraction=0.2, abs_max=1000):
""" Create all the bed- and fasta-files necessary for motif prediction and validation """
self.inputfile = inputfile
width = int(width)
fraction = float(fraction)
abs_max = int(abs_max)
self.logger.info("preparing input (FASTA)")
# Split inputfile in prediction and validation set
self.logger.debug(
"Splitting %s into prediction set (%s) and validation set (%s)",
self.inputfile, self.prediction_fa, self.validation_fa)
self.prediction_num, self.validation_num = divide_fa_file(self.inputfile, self.prediction_fa, self.validation_fa, fraction, abs_max)
def _create_background(self, bg_type, bedfile, fafile, outfile, organism="hg18", width=200, nr_times=10):
fg = Fasta(fafile)
if bg_type == "random":
if int(self.markov_model) >= 6:
self.logger.warn("Are you sure about the Markov model? It seems too high!")
else:
order = {"1":"1st","2":"2nd", "3":"3rd", "4":"4th", "5":"5th"}[str(self.markov_model)]
示例7: background
# 需要導入模塊: from gimmemotifs.config import MotifConfig [as 別名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 別名]
def background(args):
inputfile = args.inputfile
out = args.outputfile
bg_type = args.bg_type
outformat = args.outformat.lower()
length = args.length
if not bg_type in BG_TYPES:
print "The argument 'type' should be one of: %s" % (",".join(BG_TYPES))
sys.exit(1)
if outformat == "bed" and bg_type == "random":
print "Random background can only be generated in FASTA format!"
sys.exit(1)
if bg_type == "gc" and not inputfile:
print "need a FASTA formatted input file for background gc"
sys.exit(1)
# GimmeMotifs configuration for file and directory locations
config = MotifConfig()
# Genome index location for creation of FASTA files
index_dir = os.path.join(config.get_index_dir(), args.genome)
if bg_type in ["gc", "genomic", "promoter"] and outformat == "fasta":
if not os.path.exists(index_dir):
print "Index for %s does not exist. Has the genome been indexed for use with GimmeMotifs?" % args.genome
sys.exit(1)
# Gene definition
gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % args.genome)
if bg_type in ["promoter"]:
if not os.path.exists(gene_file):
print "Can't find gene definition for %s (%s). See GimmeMotifs documentation on how to add gene files." % (args.genome, gene_file)
sys.exit(1)
# Number of sequences
number = None
if args.number:
number = args.number
elif inputfile:
number = number_of_seqs_in_file(inputfile)
else:
sys.stderr.write("please provide either a number or an inputfile\n")
sys.exit(1)
if bg_type == "random":
f = Fasta(inputfile)
m = bg.MarkovFasta(f, n=number, k=args.markov_order)
m.writefasta(out)
elif bg_type == "gc":
if outformat in ["fasta", "fa"]:
m = bg.MatchedGcFasta(inputfile, args.genome, number=number)
m.writefasta(out)
else:
bg.matched_gc_bedfile(out, inputfile, args.genome, number)
elif bg_type == "promoter":
if outformat in ["fasta", "fa"]:
m = bg.PromoterFasta(gene_file, index_dir, length=length, n=number)
m.writefasta(out)
else:
bg.create_promoter_bedfile(out, gene_file, length, number)
elif bg_type == "genomic":
if outformat in ["fasta", "fa"]:
m = bg.RandomGenomicFasta(index_dir, length, number)
m.writefasta(out)
else:
bg.create_random_genomic_bedfile(out, index_dir, length, number)
示例8: Scanner
# 需要導入模塊: from gimmemotifs.config import MotifConfig [as 別名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 別名]
class Scanner(object):
"""
scan sequences with motifs
"""
def __init__(self):
self.config = MotifConfig()
self.use_cache = False
if self.config.get_default_params().get("use_cache", False):
self._init_cache()
def _init_cache(self):
try:
self.cache = make_region().configure(
'dogpile.cache.pylibmc',
expiration_time = 3600,
arguments = {
'url':["127.0.0.1"],
'binary': True,
}
# 'dogpile.cache.dbm',
# expiration_time = 3600,
# arguments = {
# 'filename': 'cache.dbm'
# }
)
self.use_cache = True
except Exception as e:
sys.stderr.write("failed to initialize cache\n")
sys.stderr.write("{}\n".format(e))
def set_motifs(self, motifs):
self.motifs = motifs
self.motif_ids = [m.id for m in read_motifs(open(motifs))]
self.checksum = {}
if self.use_cache:
chksum = CityHash64("\n".join(sorted(self.motif_ids)))
self.checksum[self.motifs] = chksum
def set_threshold(self):
"""
cutoff # based on motif matrix
p-value # moods only
FDR # based on bg file
"""
pass
def set_genome(self, genome):
"""
set the genome to be used for:
- converting regions to sequences
- background for MOODS
"""
index_dir = os.path.join(self.config.get_index_dir(), genome)
if not os.path.exists(index_dir) or not os.path.isdir(index_dir):
raise ValueError("index for {} does not exist".format(genome))
self.index_dir = index_dir
def count(self, seqs, nreport=100, scan_rc=True, cutoff=0.95):
"""
count the number of matches above the cutoff
returns an iterator of lists containing integer counts
"""
for matches in self.scan(seqs, nreport, scan_rc, cutoff):
counts = [len(m) for m in matches]
yield counts
def total_count(self, seqs, nreport=100, scan_rc=True, cutoff=0.95):
"""
count the number of matches above the cutoff
returns an iterator of lists containing integer counts
"""
count_table = [counts for counts in self.count(seqs, nreport, scan_rc, cutoff)]
return np.sum(np.array(count_table), 0)
def best_score(self, seqs, scan_rc=True):
"""
give the score of the best match of each motif in each sequence
returns an iterator of lists containing floats
"""
for matches in self.scan(seqs, 1, scan_rc, cutoff=0):
scores = [sorted(m, lambda x,y:
cmp(y[0], x[0])
)[0][0] for m in matches]
yield scores
def best_match(self, seqs, scan_rc=True):
"""
give the best match of each motif in each sequence
returns an iterator of nested lists containing tuples:
(score, position, strand)
"""
for matches in self.scan(seqs, 1, scan_rc, cutoff=0):
top = [sorted(m, lambda x,y:
cmp(y[0], x[0])
)[0] for m in matches]
yield top
#.........這裏部分代碼省略.........
示例9: get_genome
# 需要導入模塊: from gimmemotifs.config import MotifConfig [as 別名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 別名]
def get_genome(genomebuild, fastadir, indexdir=None):
config = MotifConfig()
if not indexdir:
indexdir = config.get_index_dir()
genome_dir = os.path.join(fastadir, genomebuild)
index_dir = os.path.join(indexdir, genomebuild)
pred_bin = "genePredToBed"
pred = find_executable(pred_bin)
if not pred:
sys.stderr.write("{} not found in path!\n".format(pred_bin))
sys.exit(1)
# Check for rights to write to directory
if not os.path.exists(genome_dir):
try:
os.mkdir(genome_dir)
except:
sys.stderr.write("Could not create genome dir {}\n".format(genome_dir))
sys.exit(1)
# Download gene file based on URL + genomebuild
gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)
tmp = NamedTemporaryFile(delete=False, suffix=".gz")
anno = []
f = urllib2.urlopen(UCSC_GENE_URL.format(genomebuild))
p = re.compile(r'\w+.Gene.txt.gz')
for line in f.readlines():
m = p.search(line)
if m:
anno.append(m.group(0))
sys.stderr.write("Retrieving gene annotation for {}\n".format(genomebuild))
url = ""
for a in ANNOS:
if a in anno:
url = UCSC_GENE_URL.format(genomebuild) + a
break
if url:
urllib.urlretrieve(
url,
tmp.name
)
sp.call("zcat {} | cut -f2-11 | {} /dev/stdin {}".format(tmp.name, pred, gene_file), shell=True)
else:
sys.stderr.write("No annotation found!")
# download genome based on URL + genomebuild
sys.stderr.write("Downloading {} genome\n".format(genomebuild))
for genome_url in [UCSC_GENOME_URL, ALT_UCSC_GENOME_URL]:
remote = genome_url.format(genomebuild)
genome_fa = os.path.join(
genome_dir,
os.path.split(remote)[-1]
)
sys.stderr.write("Trying to download {}\n".format(genome_url.format(genomebuild)))
urllib.urlretrieve(
genome_url.format(genomebuild),
genome_fa
)
if not check_genome_file(genome_fa):
os.unlink(genome_fa)
continue
break
if not check_genome_file(genome_fa):
sys.stderr.write("Failed to download genome\n")
sys.exit(1)
sys.stderr.write("Unpacking\n")
if genome_fa.endswith("tar.gz"):
cmd = "tar -C {0} -xvzf {1} && rm {1}".format(genome_dir, genome_fa)
else:
cmd = "gunzip {0}".format(genome_fa)
sp.call(cmd, shell=True, cwd=genome_dir)
fa_files = glob("{}/*.fa".format(genome_dir))
if len(fa_files) == 1:
f = Fasta(fa_files[0])
for n,s in f.items():
with open("{}/{}.fa".format(genome_dir, n), "w") as f:
f.write(">{}\n{}\n".format(n,s))
os.unlink(fa_files[0])
sys.stderr.write("Creating index\n")
g = GenomeIndex()
g = g.create_index(genome_dir, index_dir)
#.........這裏部分代碼省略.........