当前位置: 首页>>代码示例>>Python>>正文


Python Grouper.join方法代码示例

本文整理汇总了Python中jcvi.utils.grouper.Grouper.join方法的典型用法代码示例。如果您正苦于以下问题:Python Grouper.join方法的具体用法?Python Grouper.join怎么用?Python Grouper.join使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在jcvi.utils.grouper.Grouper的用法示例。


在下文中一共展示了Grouper.join方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: group

# 需要导入模块: from jcvi.utils.grouper import Grouper [as 别名]
# 或者: from jcvi.utils.grouper.Grouper import join [as 别名]
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    p = OptionParser(group.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b in ac.iter_pairs():
            groups.join(a, b)

    ngroups = len(groups)
    nmembers = sum(len(x) for x in groups)
    logging.debug("Created {0} groups with {1} members.".\
                  format(ngroups, nmembers))

    for g in groups:
        print ",".join(sorted(g))
开发者ID:bennyyu,项目名称:jcvi,代码行数:28,代码来源:synteny.py

示例2: iter_partitions

# 需要导入模块: from jcvi.utils.grouper import Grouper [as 别名]
# 或者: from jcvi.utils.grouper.Grouper import join [as 别名]
    def iter_partitions(self, cutoff=.3, gtr=True):
        from jcvi.utils.grouper import Grouper

        if gtr:
            names = self.gnames
            fp = open(self.gtrfile)
        else:
            names = self.anames
            fp = open(self.atrfile)

        reader = csv.reader(fp, delimiter="\t")
        grouper = Grouper()
        for g in map(GTRLine._make, reader):
            d = float(g.dist)
            if d < cutoff:
                continue

            grouper.join(g.parent, g.left_child, g.right_child)

        parents = {}
        for i, group in enumerate(grouper):
            for g in group:
                parents[g] = i

        partitions = [[parents.get(a, x), x] for a, x in names]
        for key, parts in groupby(partitions, key=lambda x: x[0]):
            yield list(x[1] for x in parts)
开发者ID:Hensonmw,项目名称:jcvi,代码行数:29,代码来源:cdt.py

示例3: pile

# 需要导入模块: from jcvi.utils.grouper import Grouper [as 别名]
# 或者: from jcvi.utils.grouper.Grouper import join [as 别名]
def pile(args):
    """
    %prog pile abedfile bbedfile > piles

    Call intersectBed on two bedfiles.
    """
    from jcvi.utils.grouper import Grouper

    p = OptionParser(pile.__doc__)
    p.add_option("--minOverlap", default=0, type="int",
                 help="Minimum overlap required [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    abedfile, bbedfile = args
    iw = intersectBed_wao(abedfile, bbedfile, minOverlap=opts.minOverlap)
    groups = Grouper()
    for a, b in iw:
        groups.join(a.accn, b.accn)

    ngroups = 0
    for group in groups:
        if len(group) > 1:
            ngroups += 1
            print "|".join(group)

    logging.debug("A total of {0} piles (>= 2 members)".format(ngroups))
开发者ID:linlifeng,项目名称:jcvi,代码行数:31,代码来源:bed.py

示例4: get_2D_overlap

# 需要导入模块: from jcvi.utils.grouper import Grouper [as 别名]
# 或者: from jcvi.utils.grouper.Grouper import join [as 别名]
def get_2D_overlap(chain, eclusters):
    """
    Implements a sweep line algorithm, that has better running time than naive O(n^2):
    assume block has x_ends, and y_ends for the bounds

    1. sort x_ends, and take a sweep line to scan the x_ends
    2. if left end, test y-axis intersection of current block with `active` set;
       also put this block in the `active` set
    3. if right end, remove block from the `active` set
    """
    mergeables = Grouper()
    active = set()

    x_ends = []
    for i, (range_x, range_y, score) in enumerate(eclusters):
        chr, left, right = range_x
        x_ends.append((chr, left, 0, i))  # 0/1 for left/right-ness
        x_ends.append((chr, right, 1, i))
    x_ends.sort()

    chr_last = ""
    for chr, pos, left_right, i in x_ends:
        if chr != chr_last: active.clear()
        if left_right == 0:
            active.add(i)
            for x in active:
                # check y-overlap
                if range_overlap(eclusters[x][1], eclusters[i][1]):
                    mergeables.join(x, i)
        else: # right end
            active.remove(i)

        chr_last = chr

    return mergeables
开发者ID:rrane,项目名称:jcvi,代码行数:37,代码来源:quota.py

示例5: fuse

# 需要导入模块: from jcvi.utils.grouper import Grouper [as 别名]
# 或者: from jcvi.utils.grouper.Grouper import join [as 别名]
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    from jcvi.algorithms.graph import BiGraph

    p = OptionParser(fuse.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    bedfiles = [x for x in args if x.endswith(".bed")]
    anchorfiles = [x for x in args if x.endswith(".anchors")]

    # TODO: Use Markov clustering to sparsify the edges
    families = Grouper()
    for anchorfile in anchorfiles:
        af = AnchorFile(anchorfile)
        for a, b, block_id in af.iter_pairs():
            families.join(a, b)

    allowed = set(families.keys())
    logging.debug("Total families: {}, Gene members: {}"
                  .format(len(families), len(allowed)))

    # TODO: Use C++ implementation of BiGraph() when available
    # For now just serialize this to the disk
    G = BiGraph()
    for bedfile in bedfiles:
        bed = Bed(bedfile, include=allowed)
        #add_bed_to_graph(G, bed, families)
        print_edges(G, bed, families)
开发者ID:tanghaibao,项目名称:jcvi,代码行数:37,代码来源:reconstruct.py

示例6: group

# 需要导入模块: from jcvi.utils.grouper import Grouper [as 别名]
# 或者: from jcvi.utils.grouper.Grouper import join [as 别名]
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    p = OptionParser(group.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b, idx in ac.iter_pairs():
            groups.join(a, b)

    logging.debug("Created {0} groups with {1} members.".\
                  format(len(groups), groups.num_members))

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    for g in groups:
        print >> fw, ",".join(sorted(g))
    fw.close()

    return outfile
开发者ID:Hensonmw,项目名称:jcvi,代码行数:33,代码来源:catalog.py

示例7: find_synteny_region

# 需要导入模块: from jcvi.utils.grouper import Grouper [as 别名]
# 或者: from jcvi.utils.grouper.Grouper import join [as 别名]
def find_synteny_region(query, sbed, data, window, cutoff, colinear=False):
    """
    Get all synteny blocks for a query, algorithm is single linkage
    anchors are a window centered on query

    Two categories of syntenic regions depending on what query is:
    (Syntelog): syntenic region is denoted by the syntelog
    (Gray gene): syntenic region is marked by the closest flanker
    """
    regions = []
    ysorted = sorted(data, key=lambda x: x[1])
    g = Grouper()

    a, b = tee(ysorted)
    next(b, None)
    for ia, ib in izip(a, b):
        pos1, pos2 = ia[1], ib[1]
        if pos2 - pos1 < window and sbed[pos1].seqid == sbed[pos2].seqid:
            g.join(ia, ib)

    for group in sorted(g):
        (qflanker, syntelog), (far_flanker, far_syntelog), flanked = \
            get_flanker(group, query)

        # run a mini-dagchainer here, take the direction that gives us most anchors
        if colinear:
            y_indexed_group = [(y, i) for i, (x, y) in enumerate(group)]
            lis = longest_increasing_subsequence(y_indexed_group)
            lds = longest_decreasing_subsequence(y_indexed_group)

            if len(lis) >= len(lds):
                track = lis
                orientation = "+"
            else:
                track = lds
                orientation = "-"

            group = [group[i] for (y, i) in track]

        xpos, ypos = zip(*group)
        score = min(len(set(xpos)), len(set(ypos)))

        if qflanker == query:
            gray = "S"
        else:
            gray = "G" if not flanked else "F"
            score -= 1  # slight penalty for not finding syntelog

        if score < cutoff:
            continue

        # y-boundary of the block
        left, right = group[0][1], group[-1][1]
        # this characterizes a syntenic region (left, right).
        # syntelog is -1 if it's a gray gene
        syn_region = (syntelog, far_syntelog, left,
                      right, gray, orientation, score)
        regions.append(syn_region)

    return sorted(regions, key=lambda x: -x[-1])  # decreasing synteny score
开发者ID:tanghaibao,项目名称:jcvi,代码行数:62,代码来源:synfind.py

示例8: tandem_grouper

# 需要导入模块: from jcvi.utils.grouper import Grouper [as 别名]
# 或者: from jcvi.utils.grouper.Grouper import join [as 别名]
def tandem_grouper(bed, blast_list, tandem_Nmax=10, flip=True):
    if not flip:
        simple_blast = [(b.query, (b.sseqid, b.si)) for b in blast_list if b.evalue < 1e-10]
    else:
        simple_blast = [(b.subject, (b.qseqid, b.qi)) for b in blast_list if b.evalue < 1e-10]

    simple_blast.sort()

    standems = Grouper()
    for name, hits in groupby(simple_blast, key=lambda x: x[0]):
        # these are already sorted.
        hits = [x[1] for x in hits]
        for ia, a in enumerate(hits[:-1]):
            b = hits[ia + 1]
            # on the same chr and rank difference no larger than tandem_Nmax
            if b[1] - a[1] <= tandem_Nmax and b[0] == a[0]:
                standems.join(a[1], b[1])

    return standems
开发者ID:arvin580,项目名称:jcvi,代码行数:21,代码来源:blastfilter.py

示例9: main

# 需要导入模块: from jcvi.utils.grouper import Grouper [as 别名]
# 或者: from jcvi.utils.grouper.Grouper import join [as 别名]
def main(blast_file, cds_file, bed_file, N=3):

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file).order

    # filter the blast file
    g = Grouper()
    fp = open(blast_file)
    for row in fp:
        b = BlastLine(row)
        query_len = sizes[b.query]
        subject_len = sizes[b.subject]
        if b.hitlen < min(query_len, subject_len) / 2:
            continue

        query, subject = gene_name(b.query), gene_name(b.subject)
        qi, q = bed[query]
        si, s = bed[subject]

        if q.seqid == s.seqid and abs(qi - si) <= N:
            g.join(query, subject)

    # dump the grouper
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print >>sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >>sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >>sys.stderr, "Longest families (%d): %s" % (len(longest_family),
        ",".join(longest_family))
开发者ID:bennyyu,项目名称:jcvi,代码行数:45,代码来源:tandem.py

示例10: chain_HSPs

# 需要导入模块: from jcvi.utils.grouper import Grouper [as 别名]
# 或者: from jcvi.utils.grouper.Grouper import join [as 别名]
def chain_HSPs(blastlines, xdist=100, ydist=100):
    """
    Take a list of BlastLines (or a BlastSlow instance), and returns a list of
    BlastLines.
    """
    key = lambda x: (x.query, x.subject)
    blastlines.sort(key=key)

    clusters = Grouper()
    for qs, points in groupby(blastlines, key=key):
        points = sorted(list(points), \
                key=lambda x: (x.qstart, x.qstop, x.sstart, x.sstop))

        n = len(points)
        for i in xrange(n):
            a = points[i]
            clusters.join(a)
            for j in xrange(i + 1, n):
                b = points[j]
                if a.orientation != b.orientation:
                    continue

                # x-axis distance
                del_x = get_distance(a, b)
                if del_x > xdist:
                    continue
                # y-axis distance
                del_y = get_distance(a, b, xaxis=False)
                if del_y > ydist:
                    continue
                # otherwise join
                clusters.join(a, b)

    chained_hsps = []
    for c in clusters:
        chained_hsps.append(combine_HSPs(c))
    chained_hsps = sorted(chained_hsps, key=lambda x: -x.score)

    return chained_hsps
开发者ID:bennyyu,项目名称:jcvi,代码行数:41,代码来源:blast.py

示例11: chain_HSPs

# 需要导入模块: from jcvi.utils.grouper import Grouper [as 别名]
# 或者: from jcvi.utils.grouper.Grouper import join [as 别名]
def chain_HSPs(blast, xdist=100, ydist=100):
    """
    Take a list of BlastLines (or a BlastSlow instance), and returns a list of
    BlastLines.
    """
    key = lambda x: (x.query, x.subject)
    blast.sort(key=key)

    clusters = Grouper()
    for qs, points in groupby(blast, key=key):
        points = sorted(list(points), \
                key=lambda x: (x.qstart, x.qstop, x.sstart, x.sstop))

        n = len(points)
        for i in xrange(n):
            a = points[i]
            clusters.join(a)
            for j in xrange(i + 1, n):
                b = points[j]

                # x-axis distance
                del_x = get_distance(a, b)
                if del_x > xdist:
                    break
                # y-axis distance
                del_y = get_distance(a, b, xaxis=False)
                if del_y > ydist:
                    continue
                # otherwise join
                clusters.join(a, b)

    chained_hsps = [combine_HSPs(x) for x in clusters]
    key = lambda x: (x.query, -x.score if x.has_score else 0)
    chained_hsps = sorted(chained_hsps, key=key)

    return chained_hsps
开发者ID:tanghaibao,项目名称:jcvi,代码行数:38,代码来源:blast.py

示例12: tandem_main

# 需要导入模块: from jcvi.utils.grouper import Grouper [as 别名]
# 或者: from jcvi.utils.grouper.Grouper import join [as 别名]
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \
    evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):

    if genefam:
        N = 1e5

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    if is_self:
        # filter the blast file
        g = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len)*P/100.:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            qi, q = order[query]
            si, s = order[subject]

            if abs(qi - si) <= N and b.evalue <= evalue:
                if genefam:
                    g.join(query, subject)
                elif q.seqid == s.seqid:
                    g.join(query, subject)

    else:
        homologs = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len)*P/100.:
                continue
            if b.evalue > evalue:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            homologs.join(query, subject)

        if genefam:
            g = homologs
        else:
            g = Grouper()
            for i, atom in enumerate(bed):
                for x in range(1, N+1):
                    if all([i-x >= 0, bed[i-x].seqid == atom.seqid, \
                        homologs.joined(bed[i-x].accn, atom.accn)]):
                        leni = sizes[bed[i].accn]
                        lenx = sizes[bed[i-x].accn]
                        if abs(leni - lenx) > max(leni, lenx)*(1-P/100.):
                            continue
                        g.join(bed[i-x].accn, atom.accn)

    # dump the grouper
    fw = must_open(ofile, "w")
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print >>fw, ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print >>sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >>sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >>sys.stderr, "Longest families (%d): %s" % (len(longest_family),
        ",".join(longest_family))

    return families
开发者ID:Hensonmw,项目名称:jcvi,代码行数:88,代码来源:catalog.py

示例13: path

# 需要导入模块: from jcvi.utils.grouper import Grouper [as 别名]
# 或者: from jcvi.utils.grouper.Grouper import join [as 别名]
def path(args):
    """
    %prog path input.bed scaffolds.fasta

    Construct golden path given a set of genetic maps. The respective weight for
    each map is given in file `weights.txt`. The map with the highest weight is
    considered the pivot map. The final output is an AGP file that contains
    ordered scaffolds.
    """
    oargs = args
    p = OptionParser(path.__doc__)
    p.add_option("-b", "--bedfile", help=SUPPRESS_HELP)
    p.add_option("-s", "--fastafile", help=SUPPRESS_HELP)
    p.add_option("-w", "--weightsfile", default="weights.txt",
                 help="Use weights from file")
    p.add_option("--distance", default="rank", choices=distance_choices,
                 help="Distance function when building initial consensus")
    p.add_option("--linkage", default="double", choices=linkage_choices,
                 help="Linkage function when building initial consensus")
    p.add_option("--gapsize", default=100, type="int",
                 help="Insert gaps of size between scaffolds")
    p.add_option("--ngen", default=500, type="int",
                 help="Iterations in GA, more ~ slower")
    p.add_option("--npop", default=100, type="int",
                 help="Population size in GA, more ~ slower")
    p.add_option("--seqid", help="Only run partition with this seqid")
    p.add_option("--links", default=10, type="int",
                 help="Only plot matchings more than")
    p.add_option("--noplot", default=False, action="store_true",
                 help="Do not visualize the alignments")
    p.set_cpus(cpus=16)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, fastafile = args
    inputbed = opts.bedfile or inputbed
    fastafile = opts.fastafile or fastafile
    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".bed"
    weightsfile = opts.weightsfile
    gapsize = opts.gapsize
    ngen = opts.ngen
    npop = opts.npop
    cpus = opts.cpus
    if sys.version_info[:2] < (2, 7):
        logging.debug("Python version: {0}. CPUs set to 1.".\
                        format(sys.version.splitlines()[0].strip()))
        cpus = 1

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    mapnames = cc.mapnames
    allseqids = cc.seqids
    weights = Weights(weightsfile, mapnames)
    pivot = weights.pivot
    ref = weights.ref
    linkage = opts.linkage
    oseqid = opts.seqid
    logging.debug("Linkage function: {0}-linkage".format(linkage))
    linkage = {"single": min, "double": double_linkage, "complete": max,
               "average": np.mean, "median": np.median}[linkage]

    # Partition the linkage groups into consensus clusters
    C = Grouper()
    # Initialize the partitions
    for mlg in cc.mlgs:
        C.join(mlg)

    logging.debug("Partition LGs based on {0}".format(ref))
    for mapname in mapnames:
        if mapname == ref:
            continue
        # Compute co-occurrence between LG pairs
        G = defaultdict(int)
        for s in allseqids:
            s = Scaffold(s, cc)
            s.add_LG_pairs(G, (ref, mapname))
        # Convert edge list to adj list
        nodes = defaultdict(list)
        for (a, b), w in G.items():
            nodes[a].append((b, w))
        # Find the best ref LG every non-ref LG matches to
        for n, neighbors in nodes.items():
            if n.split("-")[0] == ref:
                continue
            neighbors = dict(neighbors)
            best_neighbor, best_value = best_no_ambiguous(neighbors, n)
            if best_neighbor is None:
                continue
            C.join(n, best_neighbor)

    partitions = defaultdict(list)
    # Partition the scaffolds and assign them to one consensus
    for s in allseqids:
        s = Scaffold(s, cc)
        seqid = s.seqid
        counts = {}
        for mlg, count in s.mlg_counts.items():
#.........这里部分代码省略.........
开发者ID:yangjl,项目名称:jcvi,代码行数:103,代码来源:allmaps.py

示例14: enrich

# 需要导入模块: from jcvi.utils.grouper import Grouper [as 别名]
# 或者: from jcvi.utils.grouper.Grouper import join [as 别名]
def enrich(args):
    """
    %prog enrich omgfile groups ntaxa > enriched.omg

    Enrich OMG output by pulling genes misses by OMG.
    """
    p = OptionParser(enrich.__doc__)
    p.add_option("--ghost", default=False, action="store_true",
                 help="Add ghost homologs already used [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    omgfile, groupsfile, ntaxa = args
    ntaxa = int(ntaxa)
    ghost = opts.ghost

    # Get gene pair => weight mapping
    weights = get_edges()
    info = get_info()
    # Get gene => taxon mapping
    info = dict((k, v.split()[5]) for k, v in info.items())

    groups = Grouper()

    fp = open(groupsfile)
    for row in fp:
        members = row.strip().split(",")
        groups.join(*members)

    logging.debug("Imported {0} families with {1} members.".\
                    format(len(groups), groups.num_members))

    seen = set()
    omggroups = Grouper()
    fp = open(omgfile)
    for row in fp:
        genes, idxs = row.split()
        genes = genes.split(",")
        seen.update(genes)
        omggroups.join(*genes)

    nmembers = omggroups.num_members
    logging.debug("Imported {0} OMG families with {1} members.".\
                    format(len(omggroups), nmembers))
    assert nmembers == len(seen)

    alltaxa = set(str(x) for x in range(ntaxa))
    recruited = []
    fp = open(omgfile)
    for row in fp:
        genes, idxs = row.split()
        genes = genes.split(",")
        a = genes[0]

        idxs = set(idxs.split(","))
        missing_taxa = alltaxa - idxs
        if not missing_taxa:
            print row.rstrip()
            continue

        leftover = groups[a]
        if not ghost:
            leftover = set(leftover) - seen

        if not leftover:
            print row.rstrip()
            continue

        leftover_sorted_by_taxa = dict((k, \
                             [x for x in leftover if info[x] == k]) \
                                for k in missing_taxa)

        #print genes, leftover
        #print leftover_sorted_by_taxa
        solutions = []
        for solution in product(*leftover_sorted_by_taxa.values()):
            score = sum(weights.get((a, b), 0) for a in solution for b in genes)
            if score == 0:
                continue
            score += sum(weights.get((a, b), 0) for a, b in combinations(solution, 2))
            solutions.append((score, solution))
            #print solution, score

        best_solution = max(solutions) if solutions else None
        if best_solution is None:
            print row.rstrip()
            continue

        #print "best ==>", best_solution
        best_score, best_addition = best_solution
        genes.extend(best_addition)
        recruited.extend(best_addition)

        genes = sorted([(info[x], x) for x in genes])
        idxs, genes = zip(*genes)

        if ghost:  # decorate additions so it's clear that they were added
            pgenes = []
#.........这里部分代码省略.........
开发者ID:Hensonmw,项目名称:jcvi,代码行数:103,代码来源:catalog.py

示例15: napus

# 需要导入模块: from jcvi.utils.grouper import Grouper [as 别名]
# 或者: from jcvi.utils.grouper.Grouper import join [as 别名]
def napus(args):
    """
    %prog napus napus.bed brapa.boleracea.i1.blocks diploid.napus.fractionation

    Extract napus gene loss vs diploid ancestors. We are looking specifically
    for anything that has the pattern:

        BR - BO    or     BR - BO
        |                       |
        AN                     CN

    Step 1: extract BR - BO syntenic pairs
    Step 2: get diploid gene retention patterns from BR or BO as query
    Step 3: look for if AN or CN is NS(non-syntenic) or NF(not found) and
    specifically with NS, the NS location is actually the homeologous site.
    Step 4: categorize gene losses into singleton, or segmental (defined as
    consecutive losses with a maximum skip of 1
    """
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(napus.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    napusbed, brbo, dpnp = args
    retention = {}
    fp = open(dpnp)
    for row in fp:
        seqid, query, hit = row.split()
        retention[query] = hit

    order = Bed(napusbed).order

    quartetsfile = "quartets"
    fp = open(brbo)
    fw = open(quartetsfile, "w")
    AL = "AN LOST"
    CL = "CN LOST"
    for row in fp:
        br, bo = row.split()
        if '.' in (br, bo):
            continue
        an, cn = retention[br], retention[bo]
        row = "\t".join((br, bo, an, cn))
        if '.' in (an, cn):
            #print row
            continue

        # label loss candidates
        antag, anrange = get_tag(an, order)
        cntag, cnrange = get_tag(cn, order)

        if range_overlap(anrange, cnrange):
            if (antag, cntag) == ("NS", None):
                row = row + "\t{0}|{1}".format(AL, br)
            if (antag, cntag) == (None, "NS"):
                row = row + "\t{0}|{1}".format(CL, bo)

        print >> fw, row
    fw.close()

    logging.debug("Quartets and gene losses written to `{0}`.".\
                    format(quartetsfile))

    # Parse the quartets file to extract singletons vs.segmental losses
    fp = open(quartetsfile)
    fw = open(quartetsfile + ".summary", "w")
    data = [x.rstrip().split("\t") for x in fp]
    skip = 1  # max distance between losses

    g = Grouper()
    losses = [(len(x) == 5) for x in data]
    for i, d in enumerate(losses):
        if not d:
            continue
        g.join(i, i)
        itag = data[i][-1].split("|")[0]
        for j in xrange(i + 1, i + skip + 1):
            jtag = data[j][-1].split("|")[0]
            if j < len(losses) and losses[j] and itag == jtag:
                g.join(i, j)

    losses = list(g)
    singletons = [x for x in losses if len(x) == 1]
    segments = [x for x in losses if len(x) > 1]
    ns, nm = len(singletons), len(segments)
    assert len(losses) == ns + nm

    grab_tag = lambda pool, tag: \
            [x for x in pool if all(data[z][-1].startswith(tag) for z in x)]

    an_loss_singletons = grab_tag(singletons, AL)
    cn_loss_singletons = grab_tag(singletons, CL)
    als, cls = len(an_loss_singletons), len(cn_loss_singletons)

    an_loss_segments = grab_tag(segments, AL)
    cn_loss_segments = grab_tag(segments, CL)
    alm, clm = len(an_loss_segments), len(cn_loss_segments)
#.........这里部分代码省略.........
开发者ID:Hensonmw,项目名称:jcvi,代码行数:103,代码来源:fractionation.py


注:本文中的jcvi.utils.grouper.Grouper.join方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。