本文整理匯總了Python中CGAT.Genomics類的典型用法代碼示例。如果您正苦於以下問題:Python Genomics類的具體用法?Python Genomics怎麽用?Python Genomics使用的例子?那麽, 這裏精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了Genomics類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: updateIndels
def updateIndels(self, snp, is_negative_strand):
contig = snp.chromosome
lcontig = self.mFasta.getLength(contig)
# get location of insertion/deletion. The location
# is after position, hence get position and position + 1
code = self.mAnnotations.getSequence(contig, "+", snp.pos, snp.pos + 2)
self.mCode = code
variants = snp.genotype.split("/")
for variant in variants:
if variant[0] == "*":
self.mVariantType.append("W")
elif variant[0] == "+":
toinsert = variant[1:]
self.mVariantType.append("I")
elif variant[0] == "-":
todelete = variant[1:]
# deletions need to be looked at in a wider range
self.mVariantType.append("D")
else:
raise ValueError("unknown variant sign '%s'" % variant[0])
# ignore non-coding Indels
if code[0] and code[1] not in 'abcABC':
return
if is_negative_strand:
variants = [Genomics.complement(x) for x in variants]
for reference_codon in self.mReferenceCodons:
variants = snp.genotype.split("/")
variants = [x[1:] for x in variants]
for variant in variants:
if len(variant) % 3 != 0:
self.mVariantCodons.append("!")
else:
self.mVariantCodons.append(variant)
self.mVariantAAs.extend(
[Genomics.translate(x) for x in self.mVariantCodons])
示例2: countMotifs
def countMotifs(infile, motifs):
'''find regular expression *motifs* in
sequences within fasta formatted *infile*.
'''
it = FastaIterator.FastaIterator(infile)
positions = []
while 1:
try:
seq = it.next()
except StopIteration:
break
if not seq:
break
rseq = Genomics.complement(seq.sequence)
lsequence = len(seq.sequence)
pos = []
for motif, pattern in motifs:
for x in pattern.finditer(seq.sequence):
pos.append((motif, "+", x.start(), x.end()))
for x in pattern.finditer(rseq):
pos.append(
(motif, "-", lsequence - x.end(), lsequence - x.start()))
positions.append((seq.title, pos))
return positions
示例3: updateSNPs
def updateSNPs(self, snp, is_negative_strand, pos):
"""update SNPs."""
contig = snp.chromosome
lcontig = self.mFasta.getLength(contig)
reference_base = snp.reference_base
if snp.genotype in "ACGTacgt":
# homozygous substitution
self.mVariantType.append("O")
else:
# heterozygous substitution
self.mVariantType.append("E")
# switch reference strand codon to correct strand
if reference_base != "*" and is_negative_strand:
reference_base = Genomics.complement(reference_base)
# collect all possible variants of reference codons
for reference_codon in self.mReferenceCodons:
self.mReferenceAAs.append(Genomics.translate(reference_codon))
# process single base changes
variant_bases = Genomics.resolveAmbiguousNA(snp.genotype)
if reference_codon[pos] != reference_base:
raise ValueError(
"base mismatch at %i (codon=%s,%i): codon:%s != genome:%s; `%s`"
% (snp.pos, reference_codon, pos, reference_codon[pos], reference_base, ";".join(map(str, snp)))
)
for variant_base in variant_bases:
if is_negative_strand:
variant_base = Genomics.complement(variant_base)
self.mVariantAAs.extend([Genomics.translate(x) for x in self.mVariantCodons])
示例4: main
#.........這裏部分代碼省略.........
ninput = len(seqs)
map_id2title = dict( enumerate( [re.sub("\s.*", "", x.title) for x in seqs] ) )
matcher = Nubiscan.MatcherRandomisationSequences( sense_matrix,
samples = options.iterations )
results = matcher.run( masked_seqs,
options.arrangements,
qvalue_threshold = options.qvalue_threshold )
if options.combine:
results = Nubiscan.combineMotifs( results )
for r in results:
if r.alternatives:
alternatives = ",".join( [x.arrangement for x in r.alternatives ] )
else:
alternatives = ""
options.stdout.write( "\t".join( (
map_id2title[r.id],
"%i" % r.start,
"%i" % r.end,
r.strand,
r.arrangement,
"%6.4f" % r.score,
"%6.4f" % r.zscore,
"%6.4e" % r.pvalue,
"%6.4e" % r.qvalue,
alternatives) ) )
if options.add_sequence:
s = masked_seqs[int(r.id)][r.start:r.end]
if r.strand == "-": s = Genomics.complement( s )
s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
options.stdout.write( "\t%s" % s )
options.stdout.write("\n")
noutput += 1
# output stats
if options.output_stats:
outfile = E.openOutputFile( "fdr" )
outfile.write("bin\thist\tnobserved\n" )
for bin, hist, nobs in zip(matcher.bin_edges, matcher.hist, matcher.nobservations):
outfile.write( "%f\t%f\t%f\n" % (bin, hist, nobs))
outfile.close()
elif options.fdr_control == "xall":
matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix,
samples = options.iterations )
# collect all results
matches = []
for seq in FastaIterator.iterate(options.stdin):
ninput += 1
mm = matcher.run( seq.sequence,
options.arrangements,
qvalue_threshold = None )
for m in mm:
matches.append( m._replace( sequence = seq.title ) )
# estimate qvalues for all matches across all sequences
示例5: main
def main( argv = None ):
parser = E.OptionParser( version = "%prog version: $Id: analyze_codonbias_shannon.py 2864 2010-03-03 10:18:16Z andreas $",
usage = globals()["__doc__"] )
parser.add_option( "-c", "--is-cds", dest="is_cds", action="store_true",
help = "input are cds (nucleotide) sequences [%default]" )
parser.set_defaults(
is_cds = False,
)
(options, args) = E.Start( parser, argv = argv )
options.stdout.write( "snpid\tidentifier\tpos\treference\tvariant\tcounts\tweight\n" )
alphabet = "ACDEFGHIKLMNPQRSTVWY"
snpid = 0
for entry in FastaIterator.iterate( options.stdin ):
identifier = entry.title
if options.is_cds:
cds_sequence = entry.sequence.upper()
assert len(cds_sequence) % 3 == 0, \
"length of sequence '%s' is not a multiple of 3" % entry.title
sequence = Genomics.translate( cds_sequence )
weights = []
for pos, cds_pos in enumerate(range( 0, len(cds_sequence), 3)):
codon = cds_sequence[cds_pos:cds_pos+3]
counts = collections.defaultdict(int)
for x in range(0,3):
rna = codon[x]
for na in "ACGT":
if na == rna: continue
taa = Genomics.translate(codon[:x] + na + codon[x+1:])
counts[taa] += 1
weights.append( counts )
else:
sequence = entry.sequence.upper()
counts = {}
for x in alphabet: counts[x] = 1
weights = [counts] * len(sequence)
for pos, ref in enumerate( sequence ):
if ref not in alphabet: continue
w = weights[pos]
t = float(sum(w.values()))
for variant in alphabet:
if variant == ref: continue
snpid +=1
options.stdout.write(
"%s\n" % "\t".join(
( "%010i" % snpid,
identifier,
str(pos+1),
ref,
variant,
"%i" % w[variant],
"%6.4f" % (w[variant] / t),
)))
E.Stop()
示例6: updateVariants
def updateVariants(variants, lcontig, strand, phased=True):
'''update variants such that they use same coordinate
system (and strand) as the transcript
fixes 1-ness of variants
'''
new_variants = []
is_positive = Genomics.IsPositiveStrand(strand)
for variant in variants:
pos = variant.pos
genotype = bytes(variant.genotype)
reference = bytes(variant.reference)
# fix 1-ness of variants
# pos -= 1
if len(genotype) == 1:
variantseqs = list(Genomics.decodeGenotype(genotype))
has_wildtype = reference in variantseqs
action = "="
start, end = pos, pos + 1
else:
variantseqs = [x[1:] for x in genotype.split("/")]
lvariant = max([len(x) for x in variantseqs])
if not phased:
variantseqs = [x for x in variantseqs if x]
has_wildtype = "*" in genotype
if "+" in genotype and "-" in genotype:
# both insertion and deletion at position
# the range is given by the deletion
# see below for explanations
if genotype.startswith("+"):
action = ">"
variantseqs[1] += "-" * (lvariant - len(variantseqs[1]))
else:
action = "<"
variantseqs[0] += "-" * (lvariant - len(variantseqs[0]))
start, end = pos + 1, pos + lvariant + 1
elif "-" in genotype:
action = "-"
# samtools: deletions are after the base denoted by snp.position
# * <- deletion at 1
# 0 1 2 3 4 5 6
# - -
# 6 5 4 3 2 1 0
# deletion of 2+3 = (2,4)
# on reverse: (7-4, 7-2) = (3,5)
start, end = pos + 1, pos + lvariant + 1
# deletions of unequal length are filled up with "-"
# This is necessary to deal with negative strands:
# -at/-atg on the positive strand deletes a t [g]
# -at/-atg on the negative strand deletes [g] t a
variantseqs = [x + "-" * (lvariant - len(x))
for x in variantseqs]
elif "+" in genotype:
action = "+"
# indels are after the base denoted by position
# as region use both flanking base so that negative strand
# coordinates work
# insertion between position 2 and 3
# * <- insection at pos 2
# 0 1 2i3 4
# 4 3 2i1 0
# is insertion between 1 and 2 in reverse
# including both flanking residues makes it work:
# (2,3) = (5-3,5-2) = (2,3)
# but:
# (2,4) = (5-4,5-2) = (1,3)
start, end = pos, pos + 2
# revert strand
if not is_positive:
reference = Genomics.complement(reference)
variantseqs = [Genomics.complement(x.upper()) for x in variantseqs]
start, end = lcontig - end, lcontig - start
new_variants.append(ExtendedVariant._make((
start, end, reference.upper(), action, has_wildtype, variantseqs)))
return new_variants
示例7: _alignToProfile
def _alignToProfile( infile, outfile,
min_score = 0 ):
'''align sequences in *infile* against mali
Only alignments with a score higher than *min_score* are accepted.
Output multiple alignment in fasta format to *outfile* and a table
in :file:`outfile.log`.
'''
mali = Mali.Mali()
mali.readFromFile( open("../data/mouse.fasta") )
src_mali = Mali.convertMali2Alignlib( mali )
E.debug( "read mali: %i sequences x %i columns" % (mali.getNumSequences(), mali.getNumColumns() ))
# add pseudocounts
profile_mali = mali.getClone()
n = profile_mali.getNumColumns()
for x in "ACGT":
for y in range(0,2):
profile_mali.addSequence( "%s%i" % (x,y), 0, n, x * n )
profile_mali = Mali.convertMali2Alignlib( profile_mali )
alignlib.setDefaultEncoder( alignlib.getEncoder( alignlib.DNA4 ) )
alignlib.setDefaultLogOddor( alignlib.makeLogOddorUniform() )
# bg = alignlib.FrequencyVector()
# bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
# alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
# alignlib.makeSubstitutionMatrixDNA4(),
# bg,
# "ACGTN",
# 10.0, 1.0) )
profile = alignlib.makeProfile( profile_mali )
alignment_mode = alignlib.ALIGNMENT_WRAP
alignator = alignlib.makeAlignatorDPFull( alignment_mode,
-5.0,
-0.5 )
map_seq2profile = alignlib.makeAlignmentVector()
map_rseq2profile = alignlib.makeAlignmentVector()
profile.prepare()
# print profile
build_mali = alignlib.makeMultAlignment()
m = alignlib.makeAlignmentVector()
m.addDiagonal( 0, n, 0 )
build_mali.add( src_mali, m )
outf = open( outfile, "w" )
outf_log = open( outfile + ".info", "w" )
outf_log.write( "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n" )
sequences, aa = alignlib.StringVector(), alignlib.AlignandumVector()
ids = []
for pid in mali.getIdentifiers():
sequences.append( re.sub( "-", "", mali[pid] ) )
ids.append( pid )
# print str(alignlib.MultAlignmentFormatPlain( build_mali, sequences ))
c = E.Counter()
for s in FastaIterator.FastaIterator( open(infile)):
E.debug("adding %s" % s.title )
c.input += 1
rsequence = Genomics.complement(s.sequence)
seq = alignlib.makeSequence( s.sequence )
rseq = alignlib.makeSequence( rsequence )
alignator.align( map_seq2profile, seq, profile )
alignator.align( map_rseq2profile, rseq, profile )
if map_seq2profile.getScore() > map_rseq2profile.getScore():
m, seq, sequence = map_seq2profile, seq, s.sequence
else:
m, seq, sequence = map_rseq2profile, rseq, rsequence
if m.getLength() == 0:
c.skipped += 1
continue
if m.getScore() < min_score:
c.skipped += 1
continue
r = getParts( m )
covered = 0
for mm in r:
build_mali.add( mm )
sequences.append( sequence )
#.........這裏部分代碼省略.........
示例8: _buildAllele
#.........這裏部分代碼省略.........
", len(exon)=", exon.end - exon.start, \
", offsets=%i,%i," % (start_offset, end_offset), \
", offset at start=", _getOffset( exon.start, offsets), \
", offset at end=", _getOffset(exon.end, offsets)
for exon in transcript[1:]:
last_exon_sequence = exon_sequence
last_start_offset, last_end_offset = start_offset, end_offset
# get the next intron/exon parameters
exon_key = (exon.start, exon.end)
exon_sequence = exons[exon_key]
start_offset, end_offset = _getEndOffsets(exon_sequence)
intron_key = (last_end, exon.start)
if last_end == exon.start:
# catch empty introns
intron_sequence = []
intron_key = None
else:
intron_sequence = introns[intron_key]
intron_seq = "".join(intron_sequence)
###################################################
###################################################
###################################################
# add preceding intron
new_exon = True
if len(intron_seq) > frameshiftsize:
intron_name, intron_seq5, intron_seq3 = Genomics.GetIntronType(
intron_seq)
if intron_name == "unknown":
if intron_seq[:2].islower() and intron_seq[-2:].islower():
E.debug("%s: transcript has unknown splice signal - kept because not a variant: %s: %s:%s" %
(transcript_id, intron_name, intron_seq5, intron_seq3))
nsplice_noncanonical += 1
else:
is_splice_truncated = True
E.debug("%s: transcript has splice truncated allele: %s: %s:%s" %
(transcript_id, intron_name, intron_seq5, intron_seq3))
break
# start a new exon
cds_starts.append(lcds)
else:
# treat as frameshifting intron
#
# frame-shifting introns are checked if they are
# fixed by indels either in the intron itself or
# the terminal exon sequence. To this end, the effective
# size of the intron is computed:
# effective size of intron =
# indels at terminal x bases at previous exon
# + size of intron
# + indels at terminal x bases at next exon
effective_intron_size = len(intron_seq)
previous_indels = _sumIndels(
last_exon_sequence[max(0, -frameshiftsize):])
next_indels = _sumIndels(exon_sequence[:frameshiftsize])
effective_intron_size += previous_indels + next_indels
if previous_indels + next_indels == 0 and len(intron_seq) % 3 == 0:
示例9: Align
def Align(self, method, anchor=0, loglevel=1):
"""align a pair of sequences.
get rid of this and use a method class instead in the future
"""
map_a2b = alignlib_lite.py_makeAlignmentVector()
s1 = "A" * anchor + self.mSequence1 + "A" * anchor
s2 = "A" * anchor + self.mSequence2 + "A" * anchor
self.strand = "+"
if method == "dialign":
dialign = WrapperDialign.Dialign(self.mOptionsDialign)
dialign.Align(s1, s2, map_a2b)
elif method == "blastz":
blastz = WrapperBlastZ.BlastZ(self.mOptionsBlastZ)
blastz.Align(s1, s2, map_a2b)
if blastz.isReverseComplement():
self.strand = "-"
self.mSequence2 = Genomics.complement(self.mSequence2)
elif method == "dialignlgs":
dialignlgs = WrapperDialign.Dialign(self.mOptionsDialignLGS)
dialignlgs.Align(s1, s2, map_a2b)
elif method == "dba":
dba = WrapperDBA.DBA()
dba.Align(s1, s2, map_a2b)
elif method == "clustal":
raise NotImplementedError("clustal wrapper needs to be updated")
clustal = WrapperClustal.Clustal()
clustal.Align(s1, s2, map_a2b)
elif method == "nw":
seq1 = alignlib_lite.py_makeSequence(s1)
seq2 = alignlib_lite.py_makeSequence(s2)
alignator = alignlib_lite.py_makeAlignatorDPFull(alignlib_lite.py_ALIGNMENT_GLOBAL,
gop=-12.0,
gep=-2.0)
alignator.align(map_a2b, seq1, seq2)
elif method == "sw":
seq1 = alignlib_lite.py_makeSequence(s1)
seq2 = alignlib_lite.py_makeSequence(s2)
alignlib_lite.py_performIterativeAlignment(
map_a2b, seq1, seq2, alignator_sw, min_score_sw)
else:
# use callback function
method(s1, s2, map_a2b)
if map_a2b.getLength() == 0:
raise AlignmentError("empty alignment")
if anchor:
map_a2b.removeRowRegion(
anchor + len(self.mSequence1) + 1, map_a2b.getRowTo())
map_a2b.removeRowRegion(1, anchor)
map_a2b.removeColRegion(
anchor + len(self.mSequence2) + 1, map_a2b.getColTo())
map_a2b.removeColRegion(1, anchor)
map_a2b.moveAlignment(-anchor, -anchor)
f = alignlib_lite.py_AlignmentFormatExplicit(map_a2b,
alignlib_lite.py_makeSequence(
self.mSequence1),
alignlib_lite.py_makeSequence(self.mSequence2))
self.mMethod = method
self.mAlignment = map_a2b
self.mAlignedSequence1, self.mAlignedSequence2 = f.mRowAlignment, f.mColAlignment
f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
self.mAlignment1, self.mAlignment2 = f.mRowAlignment, f.mColAlignment
self.mAlignmentFrom1 = map_a2b.getRowFrom()
self.mAlignmentTo1 = map_a2b.getRowTo()
self.mAlignmentFrom2 = map_a2b.getColFrom()
self.mAlignmentTo2 = map_a2b.getColTo()
self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
self.mAligned = self.mLength - self.mNumGaps
self.SetPercentIdentity()
self.SetBlockSizes()
示例10: buildSequenceVariants
def buildSequenceVariants(self, seq, strand, pos, snp):
'''build new sequence by modifying a sequence fragment in seq at
pos with snp.
It is assumed that seq is already oriented according to strand.
The strand is used to revert the snp if necessary.
Note that only sequences different from seq will be returned.
returns is_homozygous, seqs
'''
is_negative_strand = Genomics.IsNegativeStrand(strand)
reference_base = snp.reference_base
if reference_base != "*" and is_negative_strand:
reference_base = Genomics.complement(reference_base)
new_sequences = []
is_homozygous = True
if reference_base != "*":
if seq[pos].upper() != reference_base.upper():
raise ValueError("base mismatch at snp %i, expected %s, got %s in %s at position %i; snp=%s" %
(snp.pos, reference_base, seq[pos], seq, pos,
";".join(map(str, snp))))
# single base changes
variant_bases = Genomics.resolveAmbiguousNA(snp.genotype)
if len(variant_bases) == 1:
is_homozygous = True
else:
is_homozygous = False
for variant_base in variant_bases:
if is_negative_strand:
variant_base = Genomics.complement(variant_base)
s = list(seq)
s[pos] = variant_base
s = "".join(s)
if s != seq:
new_sequences.append(s)
else:
variants = snp.genotype.split("/")
is_homozygous = False
for variant in variants:
s = list(seq)
# samtools denotes insert/deletion after position
# while python is before/at position, hence the pos+1
if variant[0] == "+":
toinsert = variant[1:].upper()
if is_negative_strand:
toinsert = Genomics.complement(toinsert)
s.insert(pos, toinsert)
else:
s.insert(pos + 1, toinsert)
elif variant[0] == "-":
# pos+1+len(x)-1 = pos+len(x)
todelete = variant[1:].upper()
l = len(todelete)
if is_negative_strand:
# delete left of pos
xstart = max(0, pos - l)
xend = pos
todelete = todelete[:min(l, pos)]
else:
# delete right of pos
xstart = pos + 1
xend = min(self.mSize, pos + 1 + l)
todelete = todelete[:self.mSize - (pos + 1)]
deleted = "".join(s[xstart:xend])
if is_negative_strand:
deleted = Genomics.complement(deleted)
if deleted != todelete:
raise ValueError("base mismatch at indel %i, expected %s, got %s in %s at position %i(%i:%i); is_negative_strand=%s, snp=%s" %
(snp.pos, todelete, deleted, seq, pos, xstart, xend,
is_negative_strand,
";".join(map(str, snp))))
del s[xstart:xend]
elif variant[0] == "*":
is_homozygous = True
else:
raise ValueError("unknown variant sign '%s'" % variant[0])
s = "".join(s)
if s != seq:
new_sequences.append(s)
return is_homozygous, new_sequences
示例11: getSequence
def getSequence(self, contig, strand="+", start=0, end=0, converter=None, as_array=False):
"""get a genomic fragment.
A genomic fragment is identified by the coordinates
contig, strand, start, end.
The converter function supplied translated these coordinates
into 0-based coordinates. By default, start and end are assumed
to be pythonic coordinates and are forward/reverse coordinates.
If as_array is set to true, return the AString object. This might
be beneficial for large sequence chunks. If as_array is set to False,
return a python string.
"""
contig = self.getToken(contig)
data = self.mIndex[contig]
# dummy is
# -> pos_seq for seekable streams
# -> block_size for unseekable streams
try:
pos_id, dummy, lsequence = struct.unpack("QQi", data)
except (struct.error, TypeError):
pos_id, dummy, lsequence, points = data
pos_seq = dummy
block_size = dummy
if end == 0:
end = lsequence
if end > lsequence:
raise ValueError("3' coordinate on %s out of bounds: %i > %i" % (contig, end, lsequence))
if start < 0:
raise ValueError("5' coordinate on %s out of bounds: %i < 0" % (contig, start))
if converter:
first_pos, last_pos = converter(start, end, str(strand) in ("+", "1"), lsequence)
elif self.mConverter:
first_pos, last_pos = self.mConverter(start, end, str(strand) in ("+", "1"), lsequence)
else:
first_pos, last_pos = start, end
if str(strand) in ("-", "0", "-1"):
first_pos, last_pos = lsequence - last_pos, lsequence - first_pos
if first_pos == last_pos:
return ""
assert first_pos < last_pos, "first position %i is larger than last position %i " % (first_pos, last_pos)
p = AString()
if self.mNoSeek:
# read directly from position
p.fromstring(self.mDatabaseFile.read(block_size, data[3], first_pos, last_pos))
else:
first_pos += pos_seq
last_pos += pos_seq
self.mDatabaseFile.seek(first_pos)
p.fromstring(self.mDatabaseFile.read(last_pos - first_pos))
if str(strand) in ("-", "0", "-1"):
p = AString(Genomics.complement(str(p)))
if self.mTranslator:
return self.mTranslator.translate(p)
elif as_array:
return p
else:
if IS_PY3:
return p.tostring().decode("ascii")
else:
return p.tostring()
示例12: main
def main(argv=None):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""
if not argv:
argv = sys.argv
# setup command line parser
parser = E.OptionParser(version="%prog version: $Id$",
usage=globals()["__doc__"])
parser.add_option("-m", "--method", dest="method", type="choice",
choices=('join', ),
help="method to apply [default=%default].")
parser.set_defaults(
method="join",
)
# add common options (-h/--help, ...) and parse command line
(options, args) = E.Start(parser, argv=argv)
if len(args) != 2:
raise ValueError(
"please supply at least two fastq files on the commandline")
fn1, fn2 = args
c = E.Counter()
outfile = options.stdout
if options.method == "join":
# merge based on diagonals in dotplot
iter1 = Fastq.iterate(IOTools.openFile(fn1))
iter2 = Fastq.iterate(IOTools.openFile(fn2))
tuple_size = 2
for left, right in zip(iter1, iter2):
c.input += 1
# build dictionary of tuples
s1, q1 = left.seq, left.quals
d = collections.defaultdict(list)
for x in range(len(s1) - tuple_size):
d[s1[x:x + tuple_size]].append(x)
s2, q2 = right.seq, right.quals
# reverse complement
s2 = Genomics.complement(s2)
q2 = q2[::-1]
# compute list of offsets/diagonals
offsets = collections.defaultdict(int)
for x in range(len(s2) - tuple_size):
c = s2[x:x + tuple_size]
for y in d[c]:
offsets[x - y] += 1
# find maximum diagonal
sorted = sorted([(y, x) for x, y in offsets.items()])
max_count, max_offset = sorted[-1]
E.debug('%s: maximum offset at %i' % (left.identifier,
max_offset))
# simple merge sequence
take = len(s2) - max_offset
merged_seq = s1 + s2[take:]
# simple merge quality scores
merged_quals = q1 + q2[take:]
new_entry = copy.copy(left)
new_entry.seq = merged_seq
new_entry.quals = merged_quals
outfile.write(new_entry)
c.output += 1
# write footer and output benchmark information.
E.info("%s" % str(c))
E.Stop()
示例13: main
def main(argv=None):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""
if argv is None:
argv = sys.argv
parser = E.OptionParser(version="%prog version: $Id$",
usage=globals()["__doc__"])
parser.add_option("--query-psl-file", dest="filename_query", type="string",
help="fasta filename with queries.")
parser.add_option("--target-psl-file", dest="filename_target", type="string",
help="fasta filename with target.")
parser.add_option("-m", "--method", dest="method", type="choice",
choices=(
"full", "pileup-query", "pileup-target", "gapless"),
help="method to use for constructing the alignment [%default].")
parser.add_option("--forward-query", dest="forward_query", action="store_true",
help="reverse-complement sequences such that query is always on forward strand [%default]")
parser.add_option("--target-prefix", dest="target_prefix", type="string",
help="prefix to use for target [%default].")
parser.add_option("--query-prefix", dest="query_prefix", type="string",
help="prefix to use for query [%default].")
parser.add_option("--id", dest="id", type="choice",
choices=("numeric", "query"),
help="choose type of identifier to use [%default]")
parser.set_defaults(
filename_query=None,
filename_target=None,
method="full",
output_format_id="%06i",
target_prefix="",
query_prefix="",
forward_query=False,
)
(options, args) = E.Start(parser)
if options.filename_query:
query = IndexedFasta.IndexedFasta(options.filename_query)
if options.filename_target:
target = IndexedFasta.IndexedFasta(options.filename_target)
if options.method == "full":
getAlignment = getAlignmentFull
id = 0
for match in Blat.iterator(options.stdin):
if options.loglevel >= 2:
options.stdout.write("# %s\n" % str(match))
m = match.getMapQuery2Target()
m.moveAlignment(-min(match.mQueryBlockStarts), -
min(match.mSbjctBlockStarts))
q = query.getSequence(
match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo)
t = target.getSequence(
match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)
query_ali, sbjct_ali = getAlignment(m, q, t, options)
if match.strand == "-" and options.forward_query:
query_ali = Genomics.complement(query_ali)
sbjct_ali = Genomics.complement(sbjct_ali)
options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" %
(options.query_prefix,
options.output_format_id % id,
match.mQueryId, match.mQueryFrom, match.mQueryTo,
query_ali,
options.target_prefix,
options.output_format_id % id,
match.mSbjctId, match.strand,
match.mSbjctFrom, match.mSbjctTo,
sbjct_ali))
id += 1
E.Stop()
示例14: len
genome = IndexedFasta.IndexedFasta( options.genome_file )
assert options.filename_regions != None, "please supply a gff formatted filename with regions"
regions = GTF.readAsIntervals( GFF.iterator( IOTools.openFile(options.filename_regions, "r" ) ) )
# build pairs for complement
reverse_splice_pairs = []
forward_splice_pairs = options.splice_pairs
left_tokens, right_tokens = {}, {}
x = 0
for a,b in forward_splice_pairs:
assert len(a) == 2, "only two-residue patterns allowed"
assert len(b) == 2, "only two-residue patterns allowed"
ca, cb = Genomics.complement( a ), Genomics.complement( b )
reverse_splice_pairs.append( (b,a) )
left_tokens[a] = x
left_tokens[cb] = x+1
right_tokens[b] = x
right_tokens[ca] = x+1
x += 2
search_area = options.search_area
read_length = options.read_length
joined = options.joined
ninput, noutput = 0, 0
if joined:
outfile_coordinates = IOTools.openFile( options.output_filename_pattern % "coords", "w" )
示例15: main
def main(argv=None):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""
if not argv:
argv = sys.argv
# setup command line parser
parser = E.OptionParser(
version="%prog version: $Id: snp2maf.py 2875 2010-03-27 17:42:04Z andreas $", usage=globals()["__doc__"])
parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
help="filename with genome [default=%default].")
parser.add_option("-t", "--tracks", dest="tracks", type="string", action="append",
help="tracks (tablenames) to use in sqlite database [default=%default].")
parser.add_option("-d", "--database", dest="database", type="string",
help="sqlite3 database [default=%default].")
parser.add_option("-r", "--reference", dest="reference", type="string",
help="name of reference [default=%default].")
parser.add_option("-i", "--is-gtf", dest="is_gtf", action="store_true",
help="if set, the gene_id will be added to the alignment header [default=%default].")
parser.add_option("-z", "--compress", dest="compress", action="store_true",
help="compress output with gzip [default=%default].")
parser.add_option("-p", "--pattern-identifier", dest="pattern_track", type="string",
help="regular expression pattern for track [default=%default].")
parser.set_defaults(
genome_file=None,
tracks=[],
database="csvdb",
output=[],
border=0,
reference_name="reference",
pattern_track="(\S+)",
is_gtf=True,
compress=False,
)
# add common options (-h/--help, ...) and parse command line
(options, args) = E.Start(parser, argv=argv, add_output_options=True)
ninput, nskipped, noutput = 0, 0, 0
if not options.database or not options.tracks:
raise ValueError("please supply both database and tracks")
if options.genome_file:
fasta = IndexedFasta.IndexedFasta(options.genome_file)
else:
fasta = None
if options.is_gtf:
infile_gff = GTF.iterator(options.stdin)
else:
infile_gff = GTF.iterator(options.stdin)
dbhandle = sqlite3.connect(options.database)
statement = '''SELECT pos, reference, genotype
FROM %(track)s
WHERE contig = '%(contig)s' AND
pos BETWEEN %(extended_start)s and %(extended_end)s
'''
counts = E.Counter()
tracks = options.tracks
try:
translated_tracks = [
re.search(options.pattern_track, track).groups()[0] for track in tracks]
except AttributeError:
raise AttributeError(
"pattern `%s` does not match input tracks." % options.pattern_track)
if options.compress:
outfile = gzip.GzipFile(fileobj=options.stdout)
else:
outfile = options.stdout
outfile.flush()
outfile.write("##maf version=1 program=snp2maf.py\n\n")
for gff in infile_gff:
counts.input += 1
contig = gff.contig
strand = gff.strand
lcontig = fasta.getLength(contig)
region_start, region_end = gff.start, gff.end
if contig.startswith("chr"):
contig = contig[3:]
extended_start = region_start - options.border
extended_end = region_end + options.border
is_positive = Genomics.IsPositiveStrand(strand)
E.info("processing %s" % str(gff))
# collect all variants
all_variants = []
#.........這裏部分代碼省略.........