本文整理汇总了Python中CGAT.Genomics.complement方法的典型用法代码示例。如果您正苦于以下问题:Python Genomics.complement方法的具体用法?Python Genomics.complement怎么用?Python Genomics.complement使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类CGAT.Genomics
的用法示例。
在下文中一共展示了Genomics.complement方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: countMotifs
# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
def countMotifs(infile, motifs):
'''find regular expression *motifs* in
sequences within fasta formatted *infile*.
'''
it = FastaIterator.FastaIterator(infile)
positions = []
while 1:
try:
seq = it.next()
except StopIteration:
break
if not seq:
break
rseq = Genomics.complement(seq.sequence)
lsequence = len(seq.sequence)
pos = []
for motif, pattern in motifs:
for x in pattern.finditer(seq.sequence):
pos.append((motif, "+", x.start(), x.end()))
for x in pattern.finditer(rseq):
pos.append(
(motif, "-", lsequence - x.end(), lsequence - x.start()))
positions.append((seq.title, pos))
return positions
示例2: updateSNPs
# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
def updateSNPs(self, snp, is_negative_strand, pos):
"""update SNPs."""
contig = snp.chromosome
lcontig = self.mFasta.getLength(contig)
reference_base = snp.reference_base
if snp.genotype in "ACGTacgt":
# homozygous substitution
self.mVariantType.append("O")
else:
# heterozygous substitution
self.mVariantType.append("E")
# switch reference strand codon to correct strand
if reference_base != "*" and is_negative_strand:
reference_base = Genomics.complement(reference_base)
# collect all possible variants of reference codons
for reference_codon in self.mReferenceCodons:
self.mReferenceAAs.append(Genomics.translate(reference_codon))
# process single base changes
variant_bases = Genomics.resolveAmbiguousNA(snp.genotype)
if reference_codon[pos] != reference_base:
raise ValueError(
"base mismatch at %i (codon=%s,%i): codon:%s != genome:%s; `%s`"
% (snp.pos, reference_codon, pos, reference_codon[pos], reference_base, ";".join(map(str, snp)))
)
for variant_base in variant_bases:
if is_negative_strand:
variant_base = Genomics.complement(variant_base)
self.mVariantAAs.extend([Genomics.translate(x) for x in self.mVariantCodons])
示例3: updateIndels
# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
def updateIndels(self, snp, is_negative_strand):
contig = snp.chromosome
lcontig = self.mFasta.getLength(contig)
# get location of insertion/deletion. The location
# is after position, hence get position and position + 1
code = self.mAnnotations.getSequence(contig, "+", snp.pos, snp.pos + 2)
self.mCode = code
variants = snp.genotype.split("/")
for variant in variants:
if variant[0] == "*":
self.mVariantType.append("W")
elif variant[0] == "+":
toinsert = variant[1:]
self.mVariantType.append("I")
elif variant[0] == "-":
todelete = variant[1:]
# deletions need to be looked at in a wider range
self.mVariantType.append("D")
else:
raise ValueError("unknown variant sign '%s'" % variant[0])
# ignore non-coding Indels
if code[0] and code[1] not in 'abcABC':
return
if is_negative_strand:
variants = [Genomics.complement(x) for x in variants]
for reference_codon in self.mReferenceCodons:
variants = snp.genotype.split("/")
variants = [x[1:] for x in variants]
for variant in variants:
if len(variant) % 3 != 0:
self.mVariantCodons.append("!")
else:
self.mVariantCodons.append(variant)
self.mVariantAAs.extend(
[Genomics.translate(x) for x in self.mVariantCodons])
示例4: Align
# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
def Align(self, method, anchor=0, loglevel=1):
"""align a pair of sequences.
get rid of this and use a method class instead in the future
"""
map_a2b = alignlib_lite.py_makeAlignmentVector()
s1 = "A" * anchor + self.mSequence1 + "A" * anchor
s2 = "A" * anchor + self.mSequence2 + "A" * anchor
self.strand = "+"
if method == "dialign":
dialign = WrapperDialign.Dialign(self.mOptionsDialign)
dialign.Align(s1, s2, map_a2b)
elif method == "blastz":
blastz = WrapperBlastZ.BlastZ(self.mOptionsBlastZ)
blastz.Align(s1, s2, map_a2b)
if blastz.isReverseComplement():
self.strand = "-"
self.mSequence2 = Genomics.complement(self.mSequence2)
elif method == "dialignlgs":
dialignlgs = WrapperDialign.Dialign(self.mOptionsDialignLGS)
dialignlgs.Align(s1, s2, map_a2b)
elif method == "dba":
dba = WrapperDBA.DBA()
dba.Align(s1, s2, map_a2b)
elif method == "clustal":
raise NotImplementedError("clustal wrapper needs to be updated")
clustal = WrapperClustal.Clustal()
clustal.Align(s1, s2, map_a2b)
elif method == "nw":
seq1 = alignlib_lite.py_makeSequence(s1)
seq2 = alignlib_lite.py_makeSequence(s2)
alignator = alignlib_lite.py_makeAlignatorDPFull(alignlib_lite.py_ALIGNMENT_GLOBAL,
gop=-12.0,
gep=-2.0)
alignator.align(map_a2b, seq1, seq2)
elif method == "sw":
seq1 = alignlib_lite.py_makeSequence(s1)
seq2 = alignlib_lite.py_makeSequence(s2)
alignlib_lite.py_performIterativeAlignment(
map_a2b, seq1, seq2, alignator_sw, min_score_sw)
else:
# use callback function
method(s1, s2, map_a2b)
if map_a2b.getLength() == 0:
raise AlignmentError("empty alignment")
if anchor:
map_a2b.removeRowRegion(
anchor + len(self.mSequence1) + 1, map_a2b.getRowTo())
map_a2b.removeRowRegion(1, anchor)
map_a2b.removeColRegion(
anchor + len(self.mSequence2) + 1, map_a2b.getColTo())
map_a2b.removeColRegion(1, anchor)
map_a2b.moveAlignment(-anchor, -anchor)
f = alignlib_lite.py_AlignmentFormatExplicit(map_a2b,
alignlib_lite.py_makeSequence(
self.mSequence1),
alignlib_lite.py_makeSequence(self.mSequence2))
self.mMethod = method
self.mAlignment = map_a2b
self.mAlignedSequence1, self.mAlignedSequence2 = f.mRowAlignment, f.mColAlignment
f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
self.mAlignment1, self.mAlignment2 = f.mRowAlignment, f.mColAlignment
self.mAlignmentFrom1 = map_a2b.getRowFrom()
self.mAlignmentTo1 = map_a2b.getRowTo()
self.mAlignmentFrom2 = map_a2b.getColFrom()
self.mAlignmentTo2 = map_a2b.getColTo()
self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
self.mAligned = self.mLength - self.mNumGaps
self.SetPercentIdentity()
self.SetBlockSizes()
示例5: main
# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
#.........这里部分代码省略.........
"# processing %s - len=%i\n" % (id, lsequence))
options.stdlog.flush()
total_len += lsequence
lvsequence = lsequence * \
random.gauss(options.coverage_mean, options.coverage_stddev)
covered = 0
counts = numpy.zeros(lsequence)
nreads = 0
error_pids, read_lengths = [], []
while covered < lvsequence:
read_length = int(
random.gauss(options.read_length_mean, options.read_length_stddev))
positive = random.randint(0, 1)
if positive:
start = random.randint(0, lsequence)
end = min(lsequence, start + read_length)
else:
end = random.randint(0, lsequence)
start = max(0, end - read_length)
read_length = end - start
if read_length < options.min_read_length:
continue
segment = sequence[start:end]
if not positive:
segment = Genomics.complement(segment)
noutput += 1
if do_error:
new_segment = getMutatedSequence(segment, options.error_mean)
pid_calc.loadPair(segment, new_segment)
pid = pid_calc.mPID
error_pids.append(pid)
segment = new_segment
else:
pid = 100.0
options.stdout.write(
">%s\n%s\n" % (options.output_format_id % noutput, segment))
if outfile_map:
outfile_map.write(
"%s\t%s\n" % (id, options.output_format_id % noutput))
for x in range(start, end):
counts[x] += 1
nreads += 1
covered += read_length
read_lengths.append(read_length)
if options.loglevel >= 2:
options.stdout.write("# transcript %s: len=%i, nreads=%i, len_mean=%.2f, len_std=%.2f, cov_mean=%.2f, cov_stddev=%.2f\n" % (id,
lsequence,
nreads,
numpy.mean(
示例6: str
# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
if options.method == "full":
getAlignment = getAlignmentFull
id = 0
for match in Blat.iterator( options.stdin ):
if options.loglevel >= 2:
options.stdout.write("# %s\n" % str(match))
m = match.getMapQuery2Target()
m.moveAlignment( -min(match.mQueryBlockStarts), -min(match.mSbjctBlockStarts) )
q = query.getSequence( match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo )
t = target.getSequence( match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo )
query_ali, sbjct_ali = getAlignment( m, q, t, options )
if match.strand == "-" and options.forward_query:
query_ali = Genomics.complement( query_ali )
sbjct_ali = Genomics.complement( sbjct_ali )
options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" % \
(options.query_prefix,
options.output_format_id % id,
match.mQueryId, match.mQueryFrom, match.mQueryTo,
query_ali,
options.target_prefix,
options.output_format_id % id,
match.mSbjctId, match.strand, match.mSbjctFrom, match.mSbjctTo,
sbjct_ali ) )
id += 1
E.Stop()
示例7: main
# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
#.........这里部分代码省略.........
"pattern `%s` does not match input tracks." % options.pattern_track)
if options.compress:
outfile = gzip.GzipFile(fileobj=options.stdout)
else:
outfile = options.stdout
outfile.flush()
outfile.write("##maf version=1 program=snp2maf.py\n\n")
for gff in infile_gff:
counts.input += 1
contig = gff.contig
strand = gff.strand
lcontig = fasta.getLength(contig)
region_start, region_end = gff.start, gff.end
if contig.startswith("chr"):
contig = contig[3:]
extended_start = region_start - options.border
extended_end = region_end + options.border
is_positive = Genomics.IsPositiveStrand(strand)
E.info("processing %s" % str(gff))
# collect all variants
all_variants = []
for track in options.tracks:
cc = dbhandle.cursor()
cc.execute(statement % locals())
all_variants.append(map(Variants.Variant._make, cc.fetchall()))
cc.close()
E.debug("%s:%i..%i collected %i variants for %i tracks" % (contig,
region_start, region_end,
sum([
len(x) for x in all_variants]),
len(all_variants)))
reference_seq = fasta.getSequence(
contig, "+", region_start, region_end)
lseq = len(reference_seq)
alleles = collections.defaultdict(list)
# build allele sequences for track and count maximum chars per mali
# column
colcounts = numpy.ones(lseq)
for track, variants in zip(translated_tracks, all_variants):
variants = Variants.updateVariants(variants, lcontig, "+")
a = Variants.buildAlleles(reference_seq,
variants,
reference_start=region_start)
alleles[track] = a
for allele in a:
for pos, c in enumerate(allele):
colcounts[pos] = max(colcounts[pos], len(c))
# realign gapped regions
alignIndels(alleles, colcounts)
if options.is_gtf:
outfile.write("a gene_id=%s\n" % gff.gene_id)
else:
outfile.write("a\n")
maf_format = "s %(name)-30s %(pos)9i %(size)6i %(strand)s %(lcontig)9i %(seq)s\n"
def __addGaps(sequence, colcounts):
'''output gapped sequence.'''
r = []
for x, c in enumerate(sequence):
r.append(c + "-" * (colcounts[x] - len(c)))
return "".join(r)
name = ".".join((options.reference, contig))
if is_positive:
pos = region_start
else:
pos = lcontig - region_start
size = lseq
seq = __addGaps(reference_seq, colcounts)
outfile.write(maf_format % (locals()))
for track in translated_tracks:
for aid, allele in enumerate(alleles[track]):
seq = __addGaps(allele, colcounts)
if not is_positive:
Genomics.complement(seq)
size = len(seq) - seq.count("-")
name = ".".join((track + "-%i" % aid, contig))
outfile.write(maf_format % (locals()))
outfile.write("\n")
E.info("%s" % str(counts))
# write footer and output benchmark information.
E.Stop()
示例8: main
# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
#.........这里部分代码省略.........
ninput = len(seqs)
map_id2title = dict( enumerate( [re.sub("\s.*", "", x.title) for x in seqs] ) )
matcher = Nubiscan.MatcherRandomisationSequences( sense_matrix,
samples = options.iterations )
results = matcher.run( masked_seqs,
options.arrangements,
qvalue_threshold = options.qvalue_threshold )
if options.combine:
results = Nubiscan.combineMotifs( results )
for r in results:
if r.alternatives:
alternatives = ",".join( [x.arrangement for x in r.alternatives ] )
else:
alternatives = ""
options.stdout.write( "\t".join( (
map_id2title[r.id],
"%i" % r.start,
"%i" % r.end,
r.strand,
r.arrangement,
"%6.4f" % r.score,
"%6.4f" % r.zscore,
"%6.4e" % r.pvalue,
"%6.4e" % r.qvalue,
alternatives) ) )
if options.add_sequence:
s = masked_seqs[int(r.id)][r.start:r.end]
if r.strand == "-": s = Genomics.complement( s )
s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
options.stdout.write( "\t%s" % s )
options.stdout.write("\n")
noutput += 1
# output stats
if options.output_stats:
outfile = E.openOutputFile( "fdr" )
outfile.write("bin\thist\tnobserved\n" )
for bin, hist, nobs in zip(matcher.bin_edges, matcher.hist, matcher.nobservations):
outfile.write( "%f\t%f\t%f\n" % (bin, hist, nobs))
outfile.close()
elif options.fdr_control == "xall":
matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix,
samples = options.iterations )
# collect all results
matches = []
for seq in FastaIterator.iterate(options.stdin):
ninput += 1
mm = matcher.run( seq.sequence,
options.arrangements,
qvalue_threshold = None )
for m in mm:
matches.append( m._replace( sequence = seq.title ) )
# estimate qvalues for all matches across all sequences
示例9: updateVariants
# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
def updateVariants(variants, lcontig, strand, phased=True):
'''update variants such that they use same coordinate
system (and strand) as the transcript
fixes 1-ness of variants
'''
new_variants = []
is_positive = Genomics.IsPositiveStrand(strand)
for variant in variants:
pos = variant.pos
genotype = bytes(variant.genotype)
reference = bytes(variant.reference)
# fix 1-ness of variants
# pos -= 1
if len(genotype) == 1:
variantseqs = list(Genomics.decodeGenotype(genotype))
has_wildtype = reference in variantseqs
action = "="
start, end = pos, pos + 1
else:
variantseqs = [x[1:] for x in genotype.split("/")]
lvariant = max([len(x) for x in variantseqs])
if not phased:
variantseqs = [x for x in variantseqs if x]
has_wildtype = "*" in genotype
if "+" in genotype and "-" in genotype:
# both insertion and deletion at position
# the range is given by the deletion
# see below for explanations
if genotype.startswith("+"):
action = ">"
variantseqs[1] += "-" * (lvariant - len(variantseqs[1]))
else:
action = "<"
variantseqs[0] += "-" * (lvariant - len(variantseqs[0]))
start, end = pos + 1, pos + lvariant + 1
elif "-" in genotype:
action = "-"
# samtools: deletions are after the base denoted by snp.position
# * <- deletion at 1
# 0 1 2 3 4 5 6
# - -
# 6 5 4 3 2 1 0
# deletion of 2+3 = (2,4)
# on reverse: (7-4, 7-2) = (3,5)
start, end = pos + 1, pos + lvariant + 1
# deletions of unequal length are filled up with "-"
# This is necessary to deal with negative strands:
# -at/-atg on the positive strand deletes a t [g]
# -at/-atg on the negative strand deletes [g] t a
variantseqs = [x + "-" * (lvariant - len(x))
for x in variantseqs]
elif "+" in genotype:
action = "+"
# indels are after the base denoted by position
# as region use both flanking base so that negative strand
# coordinates work
# insertion between position 2 and 3
# * <- insection at pos 2
# 0 1 2i3 4
# 4 3 2i1 0
# is insertion between 1 and 2 in reverse
# including both flanking residues makes it work:
# (2,3) = (5-3,5-2) = (2,3)
# but:
# (2,4) = (5-4,5-2) = (1,3)
start, end = pos, pos + 2
# revert strand
if not is_positive:
reference = Genomics.complement(reference)
variantseqs = [Genomics.complement(x.upper()) for x in variantseqs]
start, end = lcontig - end, lcontig - start
new_variants.append(ExtendedVariant._make((
start, end, reference.upper(), action, has_wildtype, variantseqs)))
return new_variants
示例10: main
# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
def main(argv=None):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""
if not argv:
argv = sys.argv
# setup command line parser
parser = E.OptionParser(version="%prog version: $Id$",
usage=globals()["__doc__"])
parser.add_option("-m", "--method", dest="method", type="choice",
choices=(
"apply",
"change-format",
"renumber-reads",
"sample",
"sort",
"trim3",
"trim5",
"unique",
"reverse-complement",
"grep"),
help="method to apply [%default]")
parser.add_option(
"--target-format", dest="target_format", type="choice",
choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
help="guess quality score format and set quality scores "
"to format [default=%default].")
parser.add_option(
"--guess-format", dest="guess_format", type="choice",
choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
help="quality score format to assume if ambiguous [default=%default].")
parser.add_option(
"--sample-size", dest="sample_size", type="float",
help="proportion of reads to sample. "
"Provide a proportion of reads to sample, e.g. 0.1 for 10%, "
"0.5 for 50%, etc [default=%default].")
parser.add_option(
"--pair-fastq-file", dest="pair", type="string",
help="if data is paired, filename with second pair. "
"Implemented for sampling [default=%default].")
parser.add_option(
"--map-tsv-file", dest="map_tsv_file", type="string",
help="filename with tab-separated identifiers mapping for "
"method apply [default=%default].")
parser.add_option(
"--num-bases", dest="nbases", type="int",
help="number of bases to trim [default=%default].")
parser.add_option(
"--seed", dest="seed", type="int",
help="seed for random number generator [default=%default].")
parser.add_option(
"--pattern-identifier", dest="renumber_pattern", type="string",
help="rename reads in file by pattern [default=%default]")
parser.add_option(
"--grep-pattern", dest="grep_pattern", type="string",
help="subset to reads matching pattern [default=%default]")
parser.set_defaults(
method=None,
change_format=None,
guess_format=None,
sample_size=0.1,
nbases=0,
pair=None,
apply=None,
seed=None,
renumber_pattern="read_%010i",
grep_pattern=".*")
# add common options (-h/--help, ...) and parse command line
(options, args) = E.Start(parser, argv=argv, add_output_options=True)
c = E.Counter()
if options.method is None:
raise ValueError("no method specified, please use --method")
if options.method == "change-format":
for record in Fastq.iterate_convert(options.stdin,
format=options.target_format,
guess=options.guess_format):
c.input += 1
options.stdout.write("%s\n" % record)
c.output += 1
elif options.method == "grep":
for record in Fastq.iterate(options.stdin):
#.........这里部分代码省略.........
示例11: buildSequenceVariants
# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
def buildSequenceVariants(self, seq, strand, pos, snp):
'''build new sequence by modifying a sequence fragment in seq at
pos with snp.
It is assumed that seq is already oriented according to strand.
The strand is used to revert the snp if necessary.
Note that only sequences different from seq will be returned.
returns is_homozygous, seqs
'''
is_negative_strand = Genomics.IsNegativeStrand(strand)
reference_base = snp.reference_base
if reference_base != "*" and is_negative_strand:
reference_base = Genomics.complement(reference_base)
new_sequences = []
is_homozygous = True
if reference_base != "*":
if seq[pos].upper() != reference_base.upper():
raise ValueError("base mismatch at snp %i, expected %s, got %s in %s at position %i; snp=%s" %
(snp.pos, reference_base, seq[pos], seq, pos,
";".join(map(str, snp))))
# single base changes
variant_bases = Genomics.resolveAmbiguousNA(snp.genotype)
if len(variant_bases) == 1:
is_homozygous = True
else:
is_homozygous = False
for variant_base in variant_bases:
if is_negative_strand:
variant_base = Genomics.complement(variant_base)
s = list(seq)
s[pos] = variant_base
s = "".join(s)
if s != seq:
new_sequences.append(s)
else:
variants = snp.genotype.split("/")
is_homozygous = False
for variant in variants:
s = list(seq)
# samtools denotes insert/deletion after position
# while python is before/at position, hence the pos+1
if variant[0] == "+":
toinsert = variant[1:].upper()
if is_negative_strand:
toinsert = Genomics.complement(toinsert)
s.insert(pos, toinsert)
else:
s.insert(pos + 1, toinsert)
elif variant[0] == "-":
# pos+1+len(x)-1 = pos+len(x)
todelete = variant[1:].upper()
l = len(todelete)
if is_negative_strand:
# delete left of pos
xstart = max(0, pos - l)
xend = pos
todelete = todelete[:min(l, pos)]
else:
# delete right of pos
xstart = pos + 1
xend = min(self.mSize, pos + 1 + l)
todelete = todelete[:self.mSize - (pos + 1)]
deleted = "".join(s[xstart:xend])
if is_negative_strand:
deleted = Genomics.complement(deleted)
if deleted != todelete:
raise ValueError("base mismatch at indel %i, expected %s, got %s in %s at position %i(%i:%i); is_negative_strand=%s, snp=%s" %
(snp.pos, todelete, deleted, seq, pos, xstart, xend,
is_negative_strand,
";".join(map(str, snp))))
del s[xstart:xend]
elif variant[0] == "*":
is_homozygous = True
else:
raise ValueError("unknown variant sign '%s'" % variant[0])
s = "".join(s)
if s != seq:
new_sequences.append(s)
return is_homozygous, new_sequences
示例12: getSequence
# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
def getSequence(self, contig, strand="+", start=0, end=0, converter=None, as_array=False):
"""get a genomic fragment.
A genomic fragment is identified by the coordinates
contig, strand, start, end.
The converter function supplied translated these coordinates
into 0-based coordinates. By default, start and end are assumed
to be pythonic coordinates and are forward/reverse coordinates.
If as_array is set to true, return the AString object. This might
be beneficial for large sequence chunks. If as_array is set to False,
return a python string.
"""
contig = self.getToken(contig)
data = self.mIndex[contig]
# dummy is
# -> pos_seq for seekable streams
# -> block_size for unseekable streams
try:
pos_id, dummy, lsequence = struct.unpack("QQi", data)
except (struct.error, TypeError):
pos_id, dummy, lsequence, points = data
pos_seq = dummy
block_size = dummy
if end == 0:
end = lsequence
if end > lsequence:
raise ValueError("3' coordinate on %s out of bounds: %i > %i" % (contig, end, lsequence))
if start < 0:
raise ValueError("5' coordinate on %s out of bounds: %i < 0" % (contig, start))
if converter:
first_pos, last_pos = converter(start, end, str(strand) in ("+", "1"), lsequence)
elif self.mConverter:
first_pos, last_pos = self.mConverter(start, end, str(strand) in ("+", "1"), lsequence)
else:
first_pos, last_pos = start, end
if str(strand) in ("-", "0", "-1"):
first_pos, last_pos = lsequence - last_pos, lsequence - first_pos
if first_pos == last_pos:
return ""
assert first_pos < last_pos, "first position %i is larger than last position %i " % (first_pos, last_pos)
p = AString()
if self.mNoSeek:
# read directly from position
p.fromstring(self.mDatabaseFile.read(block_size, data[3], first_pos, last_pos))
else:
first_pos += pos_seq
last_pos += pos_seq
self.mDatabaseFile.seek(first_pos)
p.fromstring(self.mDatabaseFile.read(last_pos - first_pos))
if str(strand) in ("-", "0", "-1"):
p = AString(Genomics.complement(str(p)))
if self.mTranslator:
return self.mTranslator.translate(p)
elif as_array:
return p
else:
if IS_PY3:
return p.tostring().decode("ascii")
else:
return p.tostring()
示例13: main
# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
def main(argv=None):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""
if not argv:
argv = sys.argv
# setup command line parser
parser = E.OptionParser(version="%prog version: $Id$",
usage=globals()["__doc__"])
parser.add_option("-m", "--method", dest="method", type="choice",
choices=('join', ),
help="method to apply [default=%default].")
parser.set_defaults(
method="join",
)
# add common options (-h/--help, ...) and parse command line
(options, args) = E.Start(parser, argv=argv)
if len(args) != 2:
raise ValueError(
"please supply at least two fastq files on the commandline")
fn1, fn2 = args
c = E.Counter()
outfile = options.stdout
if options.method == "join":
# merge based on diagonals in dotplot
iter1 = Fastq.iterate(IOTools.openFile(fn1))
iter2 = Fastq.iterate(IOTools.openFile(fn2))
tuple_size = 2
for left, right in zip(iter1, iter2):
c.input += 1
# build dictionary of tuples
s1, q1 = left.seq, left.quals
d = collections.defaultdict(list)
for x in range(len(s1) - tuple_size):
d[s1[x:x + tuple_size]].append(x)
s2, q2 = right.seq, right.quals
# reverse complement
s2 = Genomics.complement(s2)
q2 = q2[::-1]
# compute list of offsets/diagonals
offsets = collections.defaultdict(int)
for x in range(len(s2) - tuple_size):
c = s2[x:x + tuple_size]
for y in d[c]:
offsets[x - y] += 1
# find maximum diagonal
sorted = sorted([(y, x) for x, y in offsets.items()])
max_count, max_offset = sorted[-1]
E.debug('%s: maximum offset at %i' % (left.identifier,
max_offset))
# simple merge sequence
take = len(s2) - max_offset
merged_seq = s1 + s2[take:]
# simple merge quality scores
merged_quals = q1 + q2[take:]
new_entry = copy.copy(left)
new_entry.seq = merged_seq
new_entry.quals = merged_quals
outfile.write(new_entry)
c.output += 1
# write footer and output benchmark information.
E.info("%s" % str(c))
E.Stop()
示例14: main
# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
def main(argv=None):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""
if argv is None:
argv = sys.argv
parser = E.OptionParser(version="%prog version: $Id$",
usage=globals()["__doc__"])
parser.add_option("--query-psl-file", dest="filename_query", type="string",
help="fasta filename with queries.")
parser.add_option("--target-psl-file", dest="filename_target", type="string",
help="fasta filename with target.")
parser.add_option("-m", "--method", dest="method", type="choice",
choices=(
"full", "pileup-query", "pileup-target", "gapless"),
help="method to use for constructing the alignment [%default].")
parser.add_option("--forward-query", dest="forward_query", action="store_true",
help="reverse-complement sequences such that query is always on forward strand [%default]")
parser.add_option("--target-prefix", dest="target_prefix", type="string",
help="prefix to use for target [%default].")
parser.add_option("--query-prefix", dest="query_prefix", type="string",
help="prefix to use for query [%default].")
parser.add_option("--id", dest="id", type="choice",
choices=("numeric", "query"),
help="choose type of identifier to use [%default]")
parser.set_defaults(
filename_query=None,
filename_target=None,
method="full",
output_format_id="%06i",
target_prefix="",
query_prefix="",
forward_query=False,
)
(options, args) = E.Start(parser)
if options.filename_query:
query = IndexedFasta.IndexedFasta(options.filename_query)
if options.filename_target:
target = IndexedFasta.IndexedFasta(options.filename_target)
if options.method == "full":
getAlignment = getAlignmentFull
id = 0
for match in Blat.iterator(options.stdin):
if options.loglevel >= 2:
options.stdout.write("# %s\n" % str(match))
m = match.getMapQuery2Target()
m.moveAlignment(-min(match.mQueryBlockStarts), -
min(match.mSbjctBlockStarts))
q = query.getSequence(
match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo)
t = target.getSequence(
match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)
query_ali, sbjct_ali = getAlignment(m, q, t, options)
if match.strand == "-" and options.forward_query:
query_ali = Genomics.complement(query_ali)
sbjct_ali = Genomics.complement(sbjct_ali)
options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" %
(options.query_prefix,
options.output_format_id % id,
match.mQueryId, match.mQueryFrom, match.mQueryTo,
query_ali,
options.target_prefix,
options.output_format_id % id,
match.mSbjctId, match.strand,
match.mSbjctFrom, match.mSbjctTo,
sbjct_ali))
id += 1
E.Stop()
示例15: len
# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
genome = IndexedFasta.IndexedFasta( options.genome_file )
assert options.filename_regions != None, "please supply a gff formatted filename with regions"
regions = GTF.readAsIntervals( GFF.iterator( IOTools.openFile(options.filename_regions, "r" ) ) )
# build pairs for complement
reverse_splice_pairs = []
forward_splice_pairs = options.splice_pairs
left_tokens, right_tokens = {}, {}
x = 0
for a,b in forward_splice_pairs:
assert len(a) == 2, "only two-residue patterns allowed"
assert len(b) == 2, "only two-residue patterns allowed"
ca, cb = Genomics.complement( a ), Genomics.complement( b )
reverse_splice_pairs.append( (b,a) )
left_tokens[a] = x
left_tokens[cb] = x+1
right_tokens[b] = x
right_tokens[ca] = x+1
x += 2
search_area = options.search_area
read_length = options.read_length
joined = options.joined
ninput, noutput = 0, 0
if joined:
outfile_coordinates = IOTools.openFile( options.output_filename_pattern % "coords", "w" )