本文整理匯總了Python中jcvi.utils.grouper.Grouper類的典型用法代碼示例。如果您正苦於以下問題:Python Grouper類的具體用法?Python Grouper怎麽用?Python Grouper使用的例子?那麽, 這裏精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了Grouper類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: group
def group(args):
"""
%prog group anchorfiles
Group the anchors into ortho-groups. Can input multiple anchor files.
"""
p = OptionParser(group.__doc__)
opts, args = p.parse_args(args)
if len(args) < 1:
sys.exit(not p.print_help())
anchorfiles = args
groups = Grouper()
for anchorfile in anchorfiles:
ac = AnchorFile(anchorfile)
for a, b in ac.iter_pairs():
groups.join(a, b)
ngroups = len(groups)
nmembers = sum(len(x) for x in groups)
logging.debug("Created {0} groups with {1} members.".\
format(ngroups, nmembers))
for g in groups:
print ",".join(sorted(g))
示例2: iter_partitions
def iter_partitions(self, cutoff=.3, gtr=True):
from jcvi.utils.grouper import Grouper
if gtr:
names = self.gnames
fp = open(self.gtrfile)
else:
names = self.anames
fp = open(self.atrfile)
reader = csv.reader(fp, delimiter="\t")
grouper = Grouper()
for g in map(GTRLine._make, reader):
d = float(g.dist)
if d < cutoff:
continue
grouper.join(g.parent, g.left_child, g.right_child)
parents = {}
for i, group in enumerate(grouper):
for g in group:
parents[g] = i
partitions = [[parents.get(a, x), x] for a, x in names]
for key, parts in groupby(partitions, key=lambda x: x[0]):
yield list(x[1] for x in parts)
示例3: pile
def pile(args):
"""
%prog pile abedfile bbedfile > piles
Call intersectBed on two bedfiles.
"""
from jcvi.utils.grouper import Grouper
p = OptionParser(pile.__doc__)
p.add_option("--minOverlap", default=0, type="int",
help="Minimum overlap required [default: %default]")
opts, args = p.parse_args(args)
if len(args) != 2:
sys.exit(not p.print_help())
abedfile, bbedfile = args
iw = intersectBed_wao(abedfile, bbedfile, minOverlap=opts.minOverlap)
groups = Grouper()
for a, b in iw:
groups.join(a.accn, b.accn)
ngroups = 0
for group in groups:
if len(group) > 1:
ngroups += 1
print "|".join(group)
logging.debug("A total of {0} piles (>= 2 members)".format(ngroups))
示例4: get_2D_overlap
def get_2D_overlap(chain, eclusters):
"""
Implements a sweep line algorithm, that has better running time than naive O(n^2):
assume block has x_ends, and y_ends for the bounds
1. sort x_ends, and take a sweep line to scan the x_ends
2. if left end, test y-axis intersection of current block with `active` set;
also put this block in the `active` set
3. if right end, remove block from the `active` set
"""
mergeables = Grouper()
active = set()
x_ends = []
for i, (range_x, range_y, score) in enumerate(eclusters):
chr, left, right = range_x
x_ends.append((chr, left, 0, i)) # 0/1 for left/right-ness
x_ends.append((chr, right, 1, i))
x_ends.sort()
chr_last = ""
for chr, pos, left_right, i in x_ends:
if chr != chr_last: active.clear()
if left_right == 0:
active.add(i)
for x in active:
# check y-overlap
if range_overlap(eclusters[x][1], eclusters[i][1]):
mergeables.join(x, i)
else: # right end
active.remove(i)
chr_last = chr
return mergeables
示例5: fuse
def fuse(args):
"""
%prog fuse *.bed *.anchors
Fuse gene orders based on anchors file.
"""
from jcvi.algorithms.graph import BiGraph
p = OptionParser(fuse.__doc__)
opts, args = p.parse_args(args)
if len(args) < 1:
sys.exit(not p.print_help())
bedfiles = [x for x in args if x.endswith(".bed")]
anchorfiles = [x for x in args if x.endswith(".anchors")]
# TODO: Use Markov clustering to sparsify the edges
families = Grouper()
for anchorfile in anchorfiles:
af = AnchorFile(anchorfile)
for a, b, block_id in af.iter_pairs():
families.join(a, b)
allowed = set(families.keys())
logging.debug("Total families: {}, Gene members: {}"
.format(len(families), len(allowed)))
# TODO: Use C++ implementation of BiGraph() when available
# For now just serialize this to the disk
G = BiGraph()
for bedfile in bedfiles:
bed = Bed(bedfile, include=allowed)
#add_bed_to_graph(G, bed, families)
print_edges(G, bed, families)
示例6: group
def group(args):
"""
%prog group anchorfiles
Group the anchors into ortho-groups. Can input multiple anchor files.
"""
p = OptionParser(group.__doc__)
p.set_outfile()
opts, args = p.parse_args(args)
if len(args) < 1:
sys.exit(not p.print_help())
anchorfiles = args
groups = Grouper()
for anchorfile in anchorfiles:
ac = AnchorFile(anchorfile)
for a, b, idx in ac.iter_pairs():
groups.join(a, b)
logging.debug("Created {0} groups with {1} members.".\
format(len(groups), groups.num_members))
outfile = opts.outfile
fw = must_open(outfile, "w")
for g in groups:
print >> fw, ",".join(sorted(g))
fw.close()
return outfile
示例7: find_synteny_region
def find_synteny_region(query, sbed, data, window, cutoff, colinear=False):
"""
Get all synteny blocks for a query, algorithm is single linkage
anchors are a window centered on query
Two categories of syntenic regions depending on what query is:
(Syntelog): syntenic region is denoted by the syntelog
(Gray gene): syntenic region is marked by the closest flanker
"""
regions = []
ysorted = sorted(data, key=lambda x: x[1])
g = Grouper()
a, b = tee(ysorted)
next(b, None)
for ia, ib in izip(a, b):
pos1, pos2 = ia[1], ib[1]
if pos2 - pos1 < window and sbed[pos1].seqid == sbed[pos2].seqid:
g.join(ia, ib)
for group in sorted(g):
(qflanker, syntelog), (far_flanker, far_syntelog), flanked = \
get_flanker(group, query)
# run a mini-dagchainer here, take the direction that gives us most anchors
if colinear:
y_indexed_group = [(y, i) for i, (x, y) in enumerate(group)]
lis = longest_increasing_subsequence(y_indexed_group)
lds = longest_decreasing_subsequence(y_indexed_group)
if len(lis) >= len(lds):
track = lis
orientation = "+"
else:
track = lds
orientation = "-"
group = [group[i] for (y, i) in track]
xpos, ypos = zip(*group)
score = min(len(set(xpos)), len(set(ypos)))
if qflanker == query:
gray = "S"
else:
gray = "G" if not flanked else "F"
score -= 1 # slight penalty for not finding syntelog
if score < cutoff:
continue
# y-boundary of the block
left, right = group[0][1], group[-1][1]
# this characterizes a syntenic region (left, right).
# syntelog is -1 if it's a gray gene
syn_region = (syntelog, far_syntelog, left,
right, gray, orientation, score)
regions.append(syn_region)
return sorted(regions, key=lambda x: -x[-1]) # decreasing synteny score
示例8: tandem_grouper
def tandem_grouper(bed, blast_list, tandem_Nmax=10, flip=True):
if not flip:
simple_blast = [(b.query, (b.sseqid, b.si)) for b in blast_list if b.evalue < 1e-10]
else:
simple_blast = [(b.subject, (b.qseqid, b.qi)) for b in blast_list if b.evalue < 1e-10]
simple_blast.sort()
standems = Grouper()
for name, hits in groupby(simple_blast, key=lambda x: x[0]):
# these are already sorted.
hits = [x[1] for x in hits]
for ia, a in enumerate(hits[:-1]):
b = hits[ia + 1]
# on the same chr and rank difference no larger than tandem_Nmax
if b[1] - a[1] <= tandem_Nmax and b[0] == a[0]:
standems.join(a[1], b[1])
return standems
示例9: main
def main(blast_file, cds_file, bed_file, N=3):
# get the sizes for the CDS first
f = Fasta(cds_file)
sizes = dict(f.itersizes())
# retrieve the locations
bed = Bed(bed_file).order
# filter the blast file
g = Grouper()
fp = open(blast_file)
for row in fp:
b = BlastLine(row)
query_len = sizes[b.query]
subject_len = sizes[b.subject]
if b.hitlen < min(query_len, subject_len) / 2:
continue
query, subject = gene_name(b.query), gene_name(b.subject)
qi, q = bed[query]
si, s = bed[subject]
if q.seqid == s.seqid and abs(qi - si) <= N:
g.join(query, subject)
# dump the grouper
ngenes, nfamilies = 0, 0
families = []
for group in sorted(g):
if len(group) >= 2:
print ",".join(sorted(group))
ngenes += len(group)
nfamilies += 1
families.append(sorted(group))
longest_family = max(families, key=lambda x: len(x))
# generate reports
print >>sys.stderr, "Proximal paralogues (dist=%d):" % N
print >>sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
print >>sys.stderr, "Longest families (%d): %s" % (len(longest_family),
",".join(longest_family))
示例10: chain_HSPs
def chain_HSPs(blastlines, xdist=100, ydist=100):
"""
Take a list of BlastLines (or a BlastSlow instance), and returns a list of
BlastLines.
"""
key = lambda x: (x.query, x.subject)
blastlines.sort(key=key)
clusters = Grouper()
for qs, points in groupby(blastlines, key=key):
points = sorted(list(points), \
key=lambda x: (x.qstart, x.qstop, x.sstart, x.sstop))
n = len(points)
for i in xrange(n):
a = points[i]
clusters.join(a)
for j in xrange(i + 1, n):
b = points[j]
if a.orientation != b.orientation:
continue
# x-axis distance
del_x = get_distance(a, b)
if del_x > xdist:
continue
# y-axis distance
del_y = get_distance(a, b, xaxis=False)
if del_y > ydist:
continue
# otherwise join
clusters.join(a, b)
chained_hsps = []
for c in clusters:
chained_hsps.append(combine_HSPs(c))
chained_hsps = sorted(chained_hsps, key=lambda x: -x.score)
return chained_hsps
示例11: chain_HSPs
def chain_HSPs(blast, xdist=100, ydist=100):
"""
Take a list of BlastLines (or a BlastSlow instance), and returns a list of
BlastLines.
"""
key = lambda x: (x.query, x.subject)
blast.sort(key=key)
clusters = Grouper()
for qs, points in groupby(blast, key=key):
points = sorted(list(points), \
key=lambda x: (x.qstart, x.qstop, x.sstart, x.sstop))
n = len(points)
for i in xrange(n):
a = points[i]
clusters.join(a)
for j in xrange(i + 1, n):
b = points[j]
# x-axis distance
del_x = get_distance(a, b)
if del_x > xdist:
break
# y-axis distance
del_y = get_distance(a, b, xaxis=False)
if del_y > ydist:
continue
# otherwise join
clusters.join(a, b)
chained_hsps = [combine_HSPs(x) for x in clusters]
key = lambda x: (x.query, -x.score if x.has_score else 0)
chained_hsps = sorted(chained_hsps, key=key)
return chained_hsps
示例12: tandem_main
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \
evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):
if genefam:
N = 1e5
# get the sizes for the CDS first
f = Fasta(cds_file)
sizes = dict(f.itersizes())
# retrieve the locations
bed = Bed(bed_file)
order = bed.order
if is_self:
# filter the blast file
g = Grouper()
fp = open(blast_file)
for row in fp:
b = BlastLine(row)
query_len = sizes[b.query]
subject_len = sizes[b.subject]
if b.hitlen < min(query_len, subject_len)*P/100.:
continue
query = gene_name(b.query, strip_name)
subject = gene_name(b.subject, strip_name)
qi, q = order[query]
si, s = order[subject]
if abs(qi - si) <= N and b.evalue <= evalue:
if genefam:
g.join(query, subject)
elif q.seqid == s.seqid:
g.join(query, subject)
else:
homologs = Grouper()
fp = open(blast_file)
for row in fp:
b = BlastLine(row)
query_len = sizes[b.query]
subject_len = sizes[b.subject]
if b.hitlen < min(query_len, subject_len)*P/100.:
continue
if b.evalue > evalue:
continue
query = gene_name(b.query, strip_name)
subject = gene_name(b.subject, strip_name)
homologs.join(query, subject)
if genefam:
g = homologs
else:
g = Grouper()
for i, atom in enumerate(bed):
for x in range(1, N+1):
if all([i-x >= 0, bed[i-x].seqid == atom.seqid, \
homologs.joined(bed[i-x].accn, atom.accn)]):
leni = sizes[bed[i].accn]
lenx = sizes[bed[i-x].accn]
if abs(leni - lenx) > max(leni, lenx)*(1-P/100.):
continue
g.join(bed[i-x].accn, atom.accn)
# dump the grouper
fw = must_open(ofile, "w")
ngenes, nfamilies = 0, 0
families = []
for group in sorted(g):
if len(group) >= 2:
print >>fw, ",".join(sorted(group))
ngenes += len(group)
nfamilies += 1
families.append(sorted(group))
longest_family = max(families, key=lambda x: len(x))
# generate reports
print >>sys.stderr, "Proximal paralogues (dist=%d):" % N
print >>sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
print >>sys.stderr, "Longest families (%d): %s" % (len(longest_family),
",".join(longest_family))
return families
示例13: path
def path(args):
"""
%prog path input.bed scaffolds.fasta
Construct golden path given a set of genetic maps. The respective weight for
each map is given in file `weights.txt`. The map with the highest weight is
considered the pivot map. The final output is an AGP file that contains
ordered scaffolds.
"""
oargs = args
p = OptionParser(path.__doc__)
p.add_option("-b", "--bedfile", help=SUPPRESS_HELP)
p.add_option("-s", "--fastafile", help=SUPPRESS_HELP)
p.add_option("-w", "--weightsfile", default="weights.txt",
help="Use weights from file")
p.add_option("--distance", default="rank", choices=distance_choices,
help="Distance function when building initial consensus")
p.add_option("--linkage", default="double", choices=linkage_choices,
help="Linkage function when building initial consensus")
p.add_option("--gapsize", default=100, type="int",
help="Insert gaps of size between scaffolds")
p.add_option("--ngen", default=500, type="int",
help="Iterations in GA, more ~ slower")
p.add_option("--npop", default=100, type="int",
help="Population size in GA, more ~ slower")
p.add_option("--seqid", help="Only run partition with this seqid")
p.add_option("--links", default=10, type="int",
help="Only plot matchings more than")
p.add_option("--noplot", default=False, action="store_true",
help="Do not visualize the alignments")
p.set_cpus(cpus=16)
opts, args = p.parse_args(args)
if len(args) != 2:
sys.exit(not p.print_help())
inputbed, fastafile = args
inputbed = opts.bedfile or inputbed
fastafile = opts.fastafile or fastafile
pf = inputbed.rsplit(".", 1)[0]
bedfile = pf + ".bed"
weightsfile = opts.weightsfile
gapsize = opts.gapsize
ngen = opts.ngen
npop = opts.npop
cpus = opts.cpus
if sys.version_info[:2] < (2, 7):
logging.debug("Python version: {0}. CPUs set to 1.".\
format(sys.version.splitlines()[0].strip()))
cpus = 1
function = get_function(opts.distance)
cc = Map(bedfile, function)
mapnames = cc.mapnames
allseqids = cc.seqids
weights = Weights(weightsfile, mapnames)
pivot = weights.pivot
ref = weights.ref
linkage = opts.linkage
oseqid = opts.seqid
logging.debug("Linkage function: {0}-linkage".format(linkage))
linkage = {"single": min, "double": double_linkage, "complete": max,
"average": np.mean, "median": np.median}[linkage]
# Partition the linkage groups into consensus clusters
C = Grouper()
# Initialize the partitions
for mlg in cc.mlgs:
C.join(mlg)
logging.debug("Partition LGs based on {0}".format(ref))
for mapname in mapnames:
if mapname == ref:
continue
# Compute co-occurrence between LG pairs
G = defaultdict(int)
for s in allseqids:
s = Scaffold(s, cc)
s.add_LG_pairs(G, (ref, mapname))
# Convert edge list to adj list
nodes = defaultdict(list)
for (a, b), w in G.items():
nodes[a].append((b, w))
# Find the best ref LG every non-ref LG matches to
for n, neighbors in nodes.items():
if n.split("-")[0] == ref:
continue
neighbors = dict(neighbors)
best_neighbor, best_value = best_no_ambiguous(neighbors, n)
if best_neighbor is None:
continue
C.join(n, best_neighbor)
partitions = defaultdict(list)
# Partition the scaffolds and assign them to one consensus
for s in allseqids:
s = Scaffold(s, cc)
seqid = s.seqid
counts = {}
for mlg, count in s.mlg_counts.items():
#.........這裏部分代碼省略.........
示例14: enrich
def enrich(args):
"""
%prog enrich omgfile groups ntaxa > enriched.omg
Enrich OMG output by pulling genes misses by OMG.
"""
p = OptionParser(enrich.__doc__)
p.add_option("--ghost", default=False, action="store_true",
help="Add ghost homologs already used [default: %default]")
opts, args = p.parse_args(args)
if len(args) != 3:
sys.exit(not p.print_help())
omgfile, groupsfile, ntaxa = args
ntaxa = int(ntaxa)
ghost = opts.ghost
# Get gene pair => weight mapping
weights = get_edges()
info = get_info()
# Get gene => taxon mapping
info = dict((k, v.split()[5]) for k, v in info.items())
groups = Grouper()
fp = open(groupsfile)
for row in fp:
members = row.strip().split(",")
groups.join(*members)
logging.debug("Imported {0} families with {1} members.".\
format(len(groups), groups.num_members))
seen = set()
omggroups = Grouper()
fp = open(omgfile)
for row in fp:
genes, idxs = row.split()
genes = genes.split(",")
seen.update(genes)
omggroups.join(*genes)
nmembers = omggroups.num_members
logging.debug("Imported {0} OMG families with {1} members.".\
format(len(omggroups), nmembers))
assert nmembers == len(seen)
alltaxa = set(str(x) for x in range(ntaxa))
recruited = []
fp = open(omgfile)
for row in fp:
genes, idxs = row.split()
genes = genes.split(",")
a = genes[0]
idxs = set(idxs.split(","))
missing_taxa = alltaxa - idxs
if not missing_taxa:
print row.rstrip()
continue
leftover = groups[a]
if not ghost:
leftover = set(leftover) - seen
if not leftover:
print row.rstrip()
continue
leftover_sorted_by_taxa = dict((k, \
[x for x in leftover if info[x] == k]) \
for k in missing_taxa)
#print genes, leftover
#print leftover_sorted_by_taxa
solutions = []
for solution in product(*leftover_sorted_by_taxa.values()):
score = sum(weights.get((a, b), 0) for a in solution for b in genes)
if score == 0:
continue
score += sum(weights.get((a, b), 0) for a, b in combinations(solution, 2))
solutions.append((score, solution))
#print solution, score
best_solution = max(solutions) if solutions else None
if best_solution is None:
print row.rstrip()
continue
#print "best ==>", best_solution
best_score, best_addition = best_solution
genes.extend(best_addition)
recruited.extend(best_addition)
genes = sorted([(info[x], x) for x in genes])
idxs, genes = zip(*genes)
if ghost: # decorate additions so it's clear that they were added
pgenes = []
#.........這裏部分代碼省略.........
示例15: napus
def napus(args):
"""
%prog napus napus.bed brapa.boleracea.i1.blocks diploid.napus.fractionation
Extract napus gene loss vs diploid ancestors. We are looking specifically
for anything that has the pattern:
BR - BO or BR - BO
| |
AN CN
Step 1: extract BR - BO syntenic pairs
Step 2: get diploid gene retention patterns from BR or BO as query
Step 3: look for if AN or CN is NS(non-syntenic) or NF(not found) and
specifically with NS, the NS location is actually the homeologous site.
Step 4: categorize gene losses into singleton, or segmental (defined as
consecutive losses with a maximum skip of 1
"""
from jcvi.utils.cbook import SummaryStats
p = OptionParser(napus.__doc__)
opts, args = p.parse_args(args)
if len(args) != 3:
sys.exit(not p.print_help())
napusbed, brbo, dpnp = args
retention = {}
fp = open(dpnp)
for row in fp:
seqid, query, hit = row.split()
retention[query] = hit
order = Bed(napusbed).order
quartetsfile = "quartets"
fp = open(brbo)
fw = open(quartetsfile, "w")
AL = "AN LOST"
CL = "CN LOST"
for row in fp:
br, bo = row.split()
if '.' in (br, bo):
continue
an, cn = retention[br], retention[bo]
row = "\t".join((br, bo, an, cn))
if '.' in (an, cn):
#print row
continue
# label loss candidates
antag, anrange = get_tag(an, order)
cntag, cnrange = get_tag(cn, order)
if range_overlap(anrange, cnrange):
if (antag, cntag) == ("NS", None):
row = row + "\t{0}|{1}".format(AL, br)
if (antag, cntag) == (None, "NS"):
row = row + "\t{0}|{1}".format(CL, bo)
print >> fw, row
fw.close()
logging.debug("Quartets and gene losses written to `{0}`.".\
format(quartetsfile))
# Parse the quartets file to extract singletons vs.segmental losses
fp = open(quartetsfile)
fw = open(quartetsfile + ".summary", "w")
data = [x.rstrip().split("\t") for x in fp]
skip = 1 # max distance between losses
g = Grouper()
losses = [(len(x) == 5) for x in data]
for i, d in enumerate(losses):
if not d:
continue
g.join(i, i)
itag = data[i][-1].split("|")[0]
for j in xrange(i + 1, i + skip + 1):
jtag = data[j][-1].split("|")[0]
if j < len(losses) and losses[j] and itag == jtag:
g.join(i, j)
losses = list(g)
singletons = [x for x in losses if len(x) == 1]
segments = [x for x in losses if len(x) > 1]
ns, nm = len(singletons), len(segments)
assert len(losses) == ns + nm
grab_tag = lambda pool, tag: \
[x for x in pool if all(data[z][-1].startswith(tag) for z in x)]
an_loss_singletons = grab_tag(singletons, AL)
cn_loss_singletons = grab_tag(singletons, CL)
als, cls = len(an_loss_singletons), len(cn_loss_singletons)
an_loss_segments = grab_tag(segments, AL)
cn_loss_segments = grab_tag(segments, CL)
alm, clm = len(an_loss_segments), len(cn_loss_segments)
#.........這裏部分代碼省略.........