当前位置: 首页>>代码示例>>Python>>正文


Python MotifConfig.get_index_dir方法代码示例

本文整理汇总了Python中gimmemotifs.config.MotifConfig.get_index_dir方法的典型用法代码示例。如果您正苦于以下问题:Python MotifConfig.get_index_dir方法的具体用法?Python MotifConfig.get_index_dir怎么用?Python MotifConfig.get_index_dir使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在gimmemotifs.config.MotifConfig的用法示例。


在下文中一共展示了MotifConfig.get_index_dir方法的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: check_threshold

# 需要导入模块: from gimmemotifs.config import MotifConfig [as 别名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 别名]
def check_threshold(outdir, genome, scoring="count"):
    # gimme_motifs config, to get defaults
    config = MotifConfig()
    
    threshold_file = None
    if scoring == "count":
        # Motif scanning threshold
        threshold_file = os.path.join(outdir, "threshold.{}.txt".format(genome))
        if not os.path.exists(threshold_file):
        # Random sequences from genome
            index_dir = os.path.join(config.get_index_dir(), genome)
            bg_file = os.path.join(outdir, "background.{}.fa".format(genome))
            if not os.path.exists(bg_file):
                m = RandomGenomicFasta(index_dir, BG_LENGTH, BG_NUMBER)
                m.writefasta(bg_file)
    
            pwmfile = config.get_default_params().get("motif_db")
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)
            
            cmd = "gimme threshold {} {} {} > {}".format(
                    pwmfile,
                    bg_file,
                    FDR,
                    threshold_file)
            sp.call(cmd, shell=True)
        return threshold_file
开发者ID:YichaoOU,项目名称:gimmemotifs,代码行数:28,代码来源:maelstrom.py

示例2: get_genome

# 需要导入模块: from gimmemotifs.config import MotifConfig [as 别名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 别名]
def get_genome(genomebuild, fastadir, indexdir=None):

    config = MotifConfig()
    if not indexdir:
        indexdir = config.get_index_dir()

    genome_dir = os.path.join(fastadir, genomebuild)
    index_dir = os.path.join(indexdir, genomebuild)

    
    # Check for rights to write to directory
    if not os.path.exists(genome_dir):
        try:
            os.mkdir(genome_dir)
        except OSError:
            sys.stderr.write("Could not create genome dir {}\n".format(genome_dir))
            sys.exit(1)

    # Download annotation
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)
    download_annotation(genomebuild, gene_file)
    
    # Download genome FASTA file
    download_genome(genomebuild, genome_dir)

    sys.stderr.write("Creating index\n")
    g = GenomeIndex()
    g = g.create_index(genome_dir, index_dir)
    create_bedtools_fa(index_dir, genome_dir)
开发者ID:simonvh,项目名称:gimmemotifs,代码行数:31,代码来源:genome_index.py

示例3: __init__

# 需要导入模块: from gimmemotifs.config import MotifConfig [as 别名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 别名]
    def __init__(self, matchfile, genome="hg19", number=None):
        config = MotifConfig()
        index = os.path.join(config.get_index_dir(), genome)

        # Create temporary files
        tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
        tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name
        
        # Create bed-file with coordinates of random sequences
        matched_gc_bedfile(tmpbed, matchfile, genome, number)
        
        # Convert track to fasta
        track2fasta(index, tmpbed, tmpfasta)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)

        # Delete the temporary files
        os.remove(tmpbed)
        os.remove(tmpfasta)
开发者ID:georgeg9,项目名称:gimmemotifs,代码行数:22,代码来源:background.py

示例4: matched_gc_bedfile

# 需要导入模块: from gimmemotifs.config import MotifConfig [as 别名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 别名]
def matched_gc_bedfile(bedfile, matchfile, genome, number):
    N_FRACTION = 0.1
    
    config = MotifConfig()
    index = os.path.join(config.get_index_dir(), genome)
    
    genome_size = os.path.join(index, "genome.size")
    genome_fa = os.path.join(index, "genome.fa")

    if not os.path.exists(genome_size) or not os.path.exists(genome_fa):
        raise RuntimeError, "genome files not found, please re-index {} "  \
                "with a recent version of gimme index".format(genome)

    try:
        fa = Fasta(matchfile)
        gc = [(seq.upper().count("C") + seq.upper().count("G")) / float(len(seq)) for seq in fa.seqs]
        lengths = [len(seq) for seq in fa.seqs]
    except:
        try:
            bed = pybedtools.BedTool(matchfile)
            gc = [float(x[4]) for x in bed.nucleotide_content(fi=genome_fa)]
            lengths = [x.length for x in bed]
        except:
            sys.stderr.write("Please provide input file in BED or FASTA format\n")
            sys.exit(1)

    gc_hist,bins = np.histogram(gc, range=(0,1), bins=20)
    
    length = np.median(lengths)
    if np.std(lengths) > length * 0.05:
        sys.stderr.write("Sequences do not seem to be of equal length.\n")
        sys.stderr.write("GC% matched sequences of the median length ({}) will be created\n".format(length))

    total = sum(gc_hist)
    
    if number:
        norm = number * gc_hist / (float(sum(gc_hist))) + 0.5
        inorm = norm.astype(np.int)

        s = np.argsort(norm - inorm)
        while sum(inorm) > number:
            if inorm[np.argmin(s)] > 0:
                inorm[np.argmin(s)] -= 1
            s[np.argmin(s)] = len(s)
        while sum(inorm) < number:
            inorm[np.argmax(s)] += 1
            s[np.argmax(s)] = 0
        gc_hist = inorm

    rnd = pybedtools.BedTool()
    out = open(bedfile, "w")
    #sys.stderr.write("Generating sequences\n")
    #sys.stderr.write("{}\n".format(number))
    
    r = rnd.random(l=length, n=number * 15, g=genome_size).nucleotide_content(fi=genome_fa)
   #sys.stderr.write("Retrieving\n")
    features = [f[:3] + [float(f[7])] for f in r if float(f[12]) <= length * N_FRACTION]
    gc = [f[3] for f in features]
    
    #sys.stderr.write("Done\n")
    for bin_start, bin_end, count in zip(bins[:-1], bins[1:], gc_hist):
        #sys.stderr.write("CG {}-{}\n".format(bin_start, bin_end))
        if count > 0:
            rcount = 0
            for f in features:
                if (f[3] >= bin_start and f[3] < bin_end):
                    out.write("{}\t{}\t{}\n".format(*f[:3]))
                    rcount += 1
                    if rcount >= count:
                        break

            if count != rcount:
                sys.stderr.write("not enough random sequences found for {} <= GC < {} ({} instead of {})\n".format(bin_start, bin_end, rcount, count))
    out.close()
开发者ID:georgeg9,项目名称:gimmemotifs,代码行数:76,代码来源:background.py

示例5: diff

# 需要导入模块: from gimmemotifs.config import MotifConfig [as 别名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 别名]
def diff(args):

    infiles = args.inputfiles.split(",")
    bgfile = args.bgfile
    outfile = args.outputfile
    pwmfile = args.pwmfile
    cutoff = args.cutoff
    genome = args.genome
    minenr = float(args.minenr)
    minfreq = float(args.minfreq)

    tmpdir = mkdtemp()
    
    # Retrieve FASTA clusters from BED file
    if len(infiles) == 1 and infiles[0].endswith("bed"):
        if not args.genome:
            sys.stderr.write("Can't convert BED file without genome!\n")
            sys.exit(1)

        clusters = {}
        for line in open(infiles[0]):
            vals = line.strip().split("\t")
            clusters.setdefault(vals[4], []).append(vals[:3])
        
        infiles = []
        
        config = MotifConfig()
        index_dir = config.get_index_dir()

        for cluster,regions in clusters.items():
            sys.stderr.write("Creating FASTA file for {0}\n".format(cluster))
            inbed = os.path.join(tmpdir, "{0}.bed".format(cluster))
            outfa = os.path.join(tmpdir, "{0}.fa".format(cluster))
            with open(inbed, "w") as f:
                for vals in regions:
                    f.write("{0}\t{1}\t{2}\n".format(*vals))
            track2fasta(os.path.join(index_dir, genome), inbed, outfa)
            infiles.append(outfa)
    
    pwms = dict([(m.id, m) for m in pwmfile_to_motifs(pwmfile)])
    motifs = [m for m in pwms.keys()]
    names = [os.path.basename(os.path.splitext(f)[0]) for f in infiles]
    
    # Get background frequencies
    nbg = float(len(Fasta(bgfile).seqs))
    bgcounts = get_counts(bgfile, pwms.values(), cutoff)
    bgfreq = [(bgcounts[m] + 0.01) / nbg for m in motifs]
    
    # Get frequences in input files
    freq = {}
    counts = {}
    for fname in infiles:
        c = get_counts(fname, pwms.values(), cutoff)
        n = float(len(Fasta(fname).seqs))
        freq[fname] = [(c[m] + 0.01) / n for m in motifs]
        counts[fname] = [c[m] for m in motifs]
    
    freq = np.array([freq[fname] for fname in infiles]).transpose()
    counts = np.array([counts[fname] for fname in infiles]).transpose()
    
    #for row in freq:
    #    print freq

    diff_plot(motifs, pwms, names, freq, counts, bgfreq, bgcounts, outfile, minenr=minenr, minfreq=minfreq)

    shutil.rmtree(tmpdir)
开发者ID:georgeg9,项目名称:gimmemotifs,代码行数:68,代码来源:diff.py

示例6: GimmeMotifs

# 需要导入模块: from gimmemotifs.config import MotifConfig [as 别名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 别名]

#.........这里部分代码省略.........
                if len(vals) > 3:
                    try:
                        float(vals[3])
                    except ValueError:
                        pass
                        #self.logger.warn("No numerical value in column 4 on line %s of file %s, ignoring..." % (i + 1, file))

    def prepare_input_bed(self, inputfile, organism="hg18", width=200, fraction=0.2, abs_max=1000, use_strand=False):
        """ Create all the bed- and fasta-files necessary for motif prediction and validation """
        self.inputfile = inputfile

        width = int(width)
        fraction = float(fraction)
        abs_max = int(abs_max)
        use_strand = bool(use_strand)

        self.logger.info("preparing input (BED)")

        # Set all peaks to specific width
        self.logger.debug("Creating inputfile %s, width %s", self.input_bed, width)

    #    if not self.weird:
        write_equalwidth_bedfile(inputfile, width, self.input_bed)

        # Split input_bed in prediction and validation set
        self.logger.debug(
                "Splitting %s into prediction set (%s) and validation set (%s)",
                self.input_bed, self.prediction_bed, self.validation_bed)
        #if not self.weird:
        self.prediction_num, self.validation_num = divide_file(self.input_bed, self.prediction_bed, self.validation_bed, fraction, abs_max)


        # Make fasta files
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        self.logger.debug("Creating %s", self.prediction_fa)

        genome_index.track2fasta(index_dir, self.prediction_bed, self.prediction_fa, use_strand=use_strand, ignore_missing=True)
        self.logger.debug("Creating %s", self.validation_fa)
        genome_index.track2fasta(index_dir, self.validation_bed, self.validation_fa, use_strand=use_strand, ignore_missing=True)

    def prepare_input_fa(self, inputfile, width=200, fraction=0.2, abs_max=1000):
        """ Create all the bed- and fasta-files necessary for motif prediction and validation """
        self.inputfile = inputfile

        width = int(width)
        fraction = float(fraction)
        abs_max = int(abs_max)

        self.logger.info("preparing input (FASTA)")

        # Split inputfile in prediction and validation set
        self.logger.debug(
                "Splitting %s into prediction set (%s) and validation set (%s)", 
                self.inputfile, self.prediction_fa, self.validation_fa)


        self.prediction_num, self.validation_num = divide_fa_file(self.inputfile, self.prediction_fa, self.validation_fa, fraction, abs_max)


    def _create_background(self, bg_type, bedfile, fafile, outfile, organism="hg18", width=200, nr_times=10):
        fg = Fasta(fafile)
        if bg_type == "random":
            if int(self.markov_model) >= 6:
                self.logger.warn("Are you sure about the Markov model? It seems too high!")
            else:
                order = {"1":"1st","2":"2nd", "3":"3rd", "4":"4th", "5":"5th"}[str(self.markov_model)]
开发者ID:YichaoOU,项目名称:gimmemotifs,代码行数:70,代码来源:core.py

示例7: background

# 需要导入模块: from gimmemotifs.config import MotifConfig [as 别名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 别名]
def background(args):

    inputfile = args.inputfile
    out = args.outputfile
    bg_type = args.bg_type
    outformat = args.outformat.lower()
    length = args.length

    if not bg_type in BG_TYPES:
        print "The argument 'type' should be one of: %s" % (",".join(BG_TYPES))
        sys.exit(1)

    if outformat == "bed" and bg_type == "random":
        print "Random background can only be generated in FASTA format!"
        sys.exit(1)
        
    if bg_type == "gc" and not inputfile:
        print "need a FASTA formatted input file for background gc"
        sys.exit(1)
    
    # GimmeMotifs configuration for file and directory locations
    config = MotifConfig()
        
    # Genome index location for creation of FASTA files
    index_dir = os.path.join(config.get_index_dir(), args.genome)
    if bg_type in ["gc", "genomic", "promoter"] and outformat == "fasta":
        if not os.path.exists(index_dir):
            print "Index for %s does not exist. Has the genome been indexed for use with GimmeMotifs?" % args.genome
            sys.exit(1)
        
    # Gene definition
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % args.genome)
    if bg_type in ["promoter"]:
        if not os.path.exists(gene_file):
            print "Can't find gene definition for %s (%s). See GimmeMotifs documentation on how to add gene files." % (args.genome, gene_file)
            sys.exit(1)
    
    # Number of sequences
    number = None
    if args.number:
        number = args.number
    elif inputfile:
        number = number_of_seqs_in_file(inputfile)
    else:
        sys.stderr.write("please provide either a number or an inputfile\n")
        sys.exit(1)
    
    if bg_type == "random":
        f = Fasta(inputfile)
        m = bg.MarkovFasta(f, n=number, k=args.markov_order)
        m.writefasta(out)
    elif bg_type == "gc":
        if outformat in ["fasta", "fa"]:
            m = bg.MatchedGcFasta(inputfile, args.genome, number=number)
            m.writefasta(out)
        else:
            bg.matched_gc_bedfile(out, inputfile, args.genome, number)
    elif bg_type == "promoter":
        if outformat in ["fasta", "fa"]:
            m = bg.PromoterFasta(gene_file, index_dir, length=length, n=number)
            m.writefasta(out)
        else:
            bg.create_promoter_bedfile(out, gene_file, length, number)
    elif bg_type == "genomic":
        if outformat in ["fasta", "fa"]:
            m = bg.RandomGenomicFasta(index_dir, length, number)
            m.writefasta(out)
        else:
            bg.create_random_genomic_bedfile(out, index_dir, length, number)
开发者ID:PanosFirmpas,项目名称:gimmemotifs,代码行数:71,代码来源:background.py

示例8: Scanner

# 需要导入模块: from gimmemotifs.config import MotifConfig [as 别名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 别名]
class Scanner(object):
    """
    scan sequences with motifs
    """
    
    def __init__(self):
        self.config = MotifConfig()

        self.use_cache = False
        if self.config.get_default_params().get("use_cache", False):
            self._init_cache()
    
    def _init_cache(self):
        try:
            self.cache = make_region().configure(
                'dogpile.cache.pylibmc',
                expiration_time = 3600,
                arguments = {
                    'url':["127.0.0.1"],
                    'binary': True,
                    }
                #    'dogpile.cache.dbm',
                #    expiration_time = 3600,
                #    arguments = {
                #        'filename': 'cache.dbm'
                #    }
            )
            self.use_cache = True
        except Exception as e:
            sys.stderr.write("failed to initialize cache\n")
            sys.stderr.write("{}\n".format(e))

    def set_motifs(self, motifs):
        self.motifs = motifs
        self.motif_ids = [m.id for m in read_motifs(open(motifs))]
        self.checksum = {}
        if self.use_cache:
            chksum = CityHash64("\n".join(sorted(self.motif_ids)))
            self.checksum[self.motifs] = chksum

    def set_threshold(self):
        """
        cutoff  # based on motif matrix
        p-value # moods only
        FDR     # based on bg file
        """
        
        pass

    def set_genome(self, genome):
        """
        set the genome to be used for:
            - converting regions to sequences
            - background for MOODS
        """
        index_dir = os.path.join(self.config.get_index_dir(), genome)
        if not os.path.exists(index_dir) or not os.path.isdir(index_dir):
            raise ValueError("index for {} does not exist".format(genome))
        self.index_dir = index_dir
    
    def count(self, seqs, nreport=100, scan_rc=True, cutoff=0.95):
        """
        count the number of matches above the cutoff
        returns an iterator of lists containing integer counts
        """
        for matches in self.scan(seqs, nreport, scan_rc, cutoff):
            counts = [len(m) for m in matches]
            yield counts
     
    def total_count(self, seqs, nreport=100, scan_rc=True, cutoff=0.95):
        """
        count the number of matches above the cutoff
        returns an iterator of lists containing integer counts
        """
        
        count_table = [counts for counts in self.count(seqs, nreport, scan_rc, cutoff)]
        return np.sum(np.array(count_table), 0)

    def best_score(self, seqs, scan_rc=True):
        """
        give the score of the best match of each motif in each sequence
        returns an iterator of lists containing floats
        """
        for matches in self.scan(seqs, 1, scan_rc, cutoff=0):
            scores = [sorted(m, lambda x,y: 
                                    cmp(y[0], x[0])
                                    )[0][0] for m in matches]
            yield scores
 
    def best_match(self, seqs, scan_rc=True):
        """
        give the best match of each motif in each sequence
        returns an iterator of nested lists containing tuples:
        (score, position, strand)
        """
        for matches in self.scan(seqs, 1, scan_rc, cutoff=0):
            top = [sorted(m, lambda x,y: 
                                    cmp(y[0], x[0])
                                    )[0] for m in matches]
            yield top
#.........这里部分代码省略.........
开发者ID:YichaoOU,项目名称:gimmemotifs,代码行数:103,代码来源:scanner.py

示例9: get_genome

# 需要导入模块: from gimmemotifs.config import MotifConfig [as 别名]
# 或者: from gimmemotifs.config.MotifConfig import get_index_dir [as 别名]
def get_genome(genomebuild, fastadir, indexdir=None):

    config = MotifConfig()
    if not indexdir:
       indexdir = config.get_index_dir()

    genome_dir = os.path.join(fastadir, genomebuild)
    index_dir = os.path.join(indexdir, genomebuild)

    pred_bin = "genePredToBed"
    pred = find_executable(pred_bin)
    if not pred:
        sys.stderr.write("{} not found in path!\n".format(pred_bin))
        sys.exit(1)

    # Check for rights to write to directory
    if not os.path.exists(genome_dir):
        try:
            os.mkdir(genome_dir)
        except:
            sys.stderr.write("Could not create genome dir {}\n".format(genome_dir))
            sys.exit(1)

    # Download gene file based on URL + genomebuild
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)
    tmp = NamedTemporaryFile(delete=False, suffix=".gz")

    anno = []
    f = urllib2.urlopen(UCSC_GENE_URL.format(genomebuild))
    p = re.compile(r'\w+.Gene.txt.gz')
    for line in f.readlines():
        m = p.search(line)
        if m:
            anno.append(m.group(0))

    sys.stderr.write("Retrieving gene annotation for {}\n".format(genomebuild))
    url = ""
    for a in ANNOS:
        if a in anno:
            url = UCSC_GENE_URL.format(genomebuild) + a
            break
    if url:
        urllib.urlretrieve(
                url,
                tmp.name
                )

        sp.call("zcat {} | cut -f2-11 | {} /dev/stdin {}".format(tmp.name, pred, gene_file), shell=True)

    else:
        sys.stderr.write("No annotation found!")

    # download genome based on URL + genomebuild
    sys.stderr.write("Downloading {} genome\n".format(genomebuild))
    for genome_url in [UCSC_GENOME_URL, ALT_UCSC_GENOME_URL]:

        remote = genome_url.format(genomebuild)

        genome_fa = os.path.join(
                genome_dir,
                os.path.split(remote)[-1]
                )

        sys.stderr.write("Trying to download {}\n".format(genome_url.format(genomebuild)))
        urllib.urlretrieve(
                genome_url.format(genomebuild),
                genome_fa
                )

        if not check_genome_file(genome_fa):
            os.unlink(genome_fa)
            continue

        break

    if not check_genome_file(genome_fa):
        sys.stderr.write("Failed to download genome\n")
        sys.exit(1)

    sys.stderr.write("Unpacking\n")
    if genome_fa.endswith("tar.gz"):
        cmd = "tar -C {0} -xvzf {1} && rm {1}".format(genome_dir, genome_fa)
    else:
        cmd = "gunzip {0}".format(genome_fa)

    sp.call(cmd, shell=True, cwd=genome_dir)

    fa_files = glob("{}/*.fa".format(genome_dir))
    if len(fa_files) == 1:
        f = Fasta(fa_files[0])
        for n,s in f.items():
            with open("{}/{}.fa".format(genome_dir, n), "w") as f:
                f.write(">{}\n{}\n".format(n,s))

        os.unlink(fa_files[0])

    sys.stderr.write("Creating index\n")
    g = GenomeIndex()
    g = g.create_index(genome_dir, index_dir)

#.........这里部分代码省略.........
开发者ID:YichaoOU,项目名称:gimmemotifs,代码行数:103,代码来源:genome_index.py


注:本文中的gimmemotifs.config.MotifConfig.get_index_dir方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。