本文整理汇总了Python中pysam.Samfile.fetch方法的典型用法代码示例。如果您正苦于以下问题:Python Samfile.fetch方法的具体用法?Python Samfile.fetch怎么用?Python Samfile.fetch使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pysam.Samfile
的用法示例。
在下文中一共展示了Samfile.fetch方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse_barcode
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def parse_barcode(bamfile):
"""parses a sorted and index bam file, removes all cases where rna hits more than one spot in genome
and writes to a file, create file for mutant and wildtype based on barcodes"""
samfile = Samfile(bamfile, "rb")
multi_hit_file = Samfile("MultiHit.bam","wb",template=samfile)
mutant_one = Samfile("MutantOne.bam","wb",template=samfile)
wildtype_one = Samfile("WildtypeOne.bam","wb",template=samfile)
mutant_two = Samfile("MutantTwo.bam","wb",template=samfile)
wildtype_two = Samfile("WildtypeTwo.bam","wb",template=samfile)
for line in samfile.fetch():
#if line.is_secondary:
## does this hit to more than one spot in genome
# multi_hit_file.write(line)
if "#GAGT"in line.qname:
## write to mutant file
mutant_one.write(line)
elif "#TTAG" in line.qname:
mutant_two.write(line)
elif "#ACCC" in line.qname:
### write to wildtype file
wildtype_one.write(line)
elif "#CGTA" in line.qname:
### write to wildtype file
wildtype_two.write(line)
multi_hit_file.close()
mutant_one.close()
wildtype_one.close()
mutant_two.close()
wildtype_two.close()
samfile.close()
示例2: main
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def main(args):
option = "r" if args.samformat else "rb"
samfile = Samfile(args.bamfile, "rb")
#Iterates over each read instead of each contig
outputs = defaultdict(list)
#import ipdb; ipdb.set_trace()
for aln in samfile.fetch(until_eof = True):
ref = samfile.getrname(aln.tid)
outputs[ref].append(aln)
for ref, alns in outputs.iteritems():
print_reads(alns, ref, samfile.header)
示例3: get_raw_tracks
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def get_raw_tracks(args):
# Initializing Error Handler
err = ErrorHandler()
if len(args.input_files) != 2:
err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.")
output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix))
bam = Samfile(args.input_files[0], "rb")
regions = GenomicRegionSet("Interested regions")
regions.read(args.input_files[1])
regions.merge()
reads_file = GenomicSignal()
with open(output_fname, "a") as output_f:
for region in regions:
# Raw counts
signal = [0.0] * (region.final - region.initial)
for read in bam.fetch(region.chrom, region.initial, region.final):
if not read.is_reverse:
cut_site = read.pos + args.forward_shift
if region.initial <= cut_site < region.final:
signal[cut_site - region.initial] += 1.0
else:
cut_site = read.aend + args.reverse_shift - 1
if region.initial <= cut_site < region.final:
signal[cut_site - region.initial] += 1.0
if args.norm:
signal = reads_file.boyle_norm(signal)
perc = scoreatpercentile(signal, 98)
std = np.std(signal)
signal = reads_file.hon_norm_atac(signal, perc, std)
output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" +
"\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n")
output_f.close()
if args.bigWig:
genome_data = GenomeData(args.organism)
chrom_sizes_file = genome_data.get_chromosome_sizes()
bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix))
os.system(" ".join(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"]))
os.remove(output_fname)
示例4: main
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def main(args):
option = "r" if args.samformat else "rb"
samfile = Samfile(args.bamfile, "rb")
ref_ids = [samfile.gettid(r) for r in samfile.references]
#Iterates over each read instead of each contig
reads_to_print = []
for aln in samfile.fetch(until_eof = True):
if pair_is_aligned(aln, ref_ids):
if args.read_pair == 1 and aln.is_read1:
reads_to_print.append(aln)
elif args.read_pair == 2 and aln.is_read2:
reads_to_print.append(aln)
elif args.read_pair == 0:
reads_to_print.append(aln)
if len(reads_to_print) >= 10000:
# Flush the reads collected
print_reads(reads_to_print)
reads_to_print = []
print_reads(reads_to_print)
示例5: main
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def main(args):
option = "r" if args.samformat else "rb"
samfile = Samfile(args.bamfile, option)
ref_ids = [samfile.gettid(r) for r in samfile.references]
#Iterates over each read instead of each contig
reads_to_print_1 = []
reads_to_print_2 = []
reads_to_print_u = []
for aln in samfile.fetch(until_eof = True):
if aln.tid in ref_ids: # This read is aligned
if aln.rnext in ref_ids: # The mate is also aligned
if aln.is_read1:
reads_to_print_1.append(aln)
reads_to_print_1 = flush_reads(reads_to_print_1, args.R1)
elif aln.is_read2:
reads_to_print_2.append(aln)
reads_to_print_2 = flush_reads(reads_to_print_2, args.R2)
else:
reads_to_print_u.append(aln)
reads_to_print_u = flush_reads(reads_to_print_u, args.u)
print_reads(reads_to_print_1, args.R1)
print_reads(reads_to_print_2, args.R2)
print_reads(reads_to_print_u, args.u)
示例6: ace
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def ace(args):
"""
%prog ace bamfile fastafile
convert bam format to ace format. This often allows the remapping to be
assessed as a denovo assembly format. bam file needs to be indexed. also
creates a .mates file to be used in amos/bambus, and .astat file to mark
whether the contig is unique or repetitive based on A-statistics in Celera
assembler.
"""
p = OptionParser(ace.__doc__)
p.add_option("--splitdir", dest="splitdir", default="outRoot",
help="split the ace per contig to dir [default: %default]")
p.add_option("--unpaired", dest="unpaired", default=False,
help="remove read pairs on the same contig [default: %default]")
p.add_option("--minreadno", dest="minreadno", default=3, type="int",
help="minimum read numbers per contig [default: %default]")
p.add_option("--minctgsize", dest="minctgsize", default=100, type="int",
help="minimum contig size per contig [default: %default]")
p.add_option("--astat", default=False, action="store_true",
help="create .astat to list repetitiveness [default: %default]")
p.add_option("--readids", default=False, action="store_true",
help="create file of mapped and unmapped ids [default: %default]")
from pysam import Samfile
opts, args = p.parse_args(args)
if len(args) != 2:
sys.exit(not p.print_help())
bamfile, fastafile = args
astat = opts.astat
readids = opts.readids
f = Fasta(fastafile)
prefix = bamfile.split(".")[0]
acefile = prefix + ".ace"
readsfile = prefix + ".reads"
astatfile = prefix + ".astat"
logging.debug("Load {0}".format(bamfile))
s = Samfile(bamfile, "rb")
ncontigs = s.nreferences
genomesize = sum(x for a, x in f.itersizes())
logging.debug("Total {0} contigs with size {1} base".format(ncontigs,
genomesize))
qual = "20" # default qual
totalreads = sum(s.count(x) for x in s.references)
logging.debug("Total {0} reads mapped".format(totalreads))
fw = open(acefile, "w")
if astat:
astatfw = open(astatfile, "w")
if readids:
readsfw = open(readsfile, "w")
print >> fw, "AS {0} {1}".format(ncontigs, totalreads)
print >> fw
for i, contig in enumerate(s.references):
cseq = f[contig]
nbases = len(cseq)
mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
nreads = len(mapped_reads)
nsegments = 0
print >> fw, "CO {0} {1} {2} {3} U".format(contig, nbases, nreads,
nsegments)
print >> fw, fill(str(cseq.seq))
print >> fw
if astat:
astat = Astat(nbases, nreads, genomesize, totalreads)
print >> astatfw, "{0}\t{1:.1f}".format(contig, astat)
text = fill([qual] * nbases, delimiter=" ", width=30)
print >> fw, "BQ\n{0}".format(text)
print >> fw
rnames = []
for a in mapped_reads:
readname = a.qname
rname = readname
if readids:
print >> readsfw, readname
rnames.append(rname)
strand = "C" if a.is_reverse else "U"
paddedstart = a.pos + 1 # 0-based to 1-based
af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
print >> fw, af
print >> fw
for a, rname in zip(mapped_reads, rnames):
#.........这里部分代码省略.........
示例7: get_raw_signal
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def get_raw_signal(arguments):
(mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
window_size, forward_shift, reverse_shift) = arguments
mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
mpbs1.read(mpbs_file1)
mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
mpbs2.read(mpbs_file2)
mpbs = mpbs1.combine(mpbs2, output=True)
mpbs.sort()
bam1 = Samfile(reads_file1, "rb")
bam2 = Samfile(reads_file2, "rb")
genome_data = GenomeData(organism)
fasta = Fastafile(genome_data.get_genome())
signal_1 = np.zeros(window_size)
signal_2 = np.zeros(window_size)
motif_len = None
pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size),
("G", [0.0] * window_size), ("T", [0.0] * window_size),
("N", [0.0] * window_size)])
mpbs_regions = mpbs.by_names([mpbs_name])
num_motif = len(mpbs_regions)
for region in mpbs_regions:
if motif_len is None:
motif_len = region.final - region.initial
mid = (region.final + region.initial) / 2
p1 = mid - window_size / 2
p2 = mid + window_size / 2
if p1 <= 0:
continue
# Fetch raw signal
for read in bam1.fetch(region.chrom, p1, p2):
# check if the read is unmapped, according to issue #112
if read.is_unmapped:
continue
if not read.is_reverse:
cut_site = read.pos + forward_shift
if p1 <= cut_site < p2:
signal_1[cut_site - p1] += 1.0
else:
cut_site = read.aend + reverse_shift - 1
if p1 <= cut_site < p2:
signal_1[cut_site - p1] += 1.0
for read in bam2.fetch(region.chrom, p1, p2):
# check if the read is unmapped, according to issue #112
if read.is_unmapped:
continue
if not read.is_reverse:
cut_site = read.pos + forward_shift
if p1 <= cut_site < p2:
signal_2[cut_site - p1] += 1.0
else:
cut_site = read.aend + reverse_shift - 1
if p1 <= cut_site < p2:
signal_2[cut_site - p1] += 1.0
update_pwm(pwm, fasta, region, p1, p2)
return signal_1, signal_2, motif_len, pwm, num_motif
示例8: open
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
intFile = open(intFileName,"r")
spr = 0.0
counter = 0.0
for line in intFile:
# Fetching signal
ll = line.strip().split("\t")
mLen = int(ll[2]) - int(ll[1])
mid = (int(ll[1])+int(ll[2]))/2
p1 = max(mid - halfWindow,0)
p2 = mid + halfWindow
# Fetch raw signal
pileup_region = PileupRegion(p1,p2,1)
if(ps_version == "0.7.5"):
bam.fetch(reference=ll[0], start=p1, end=p2, callback = pileup_region)
else:
iter = bam.fetch(reference=ll[0], start=p1, end=p2)
for alignment in iter: pileup_region.__call__(alignment)
raw_signal = array([min(e,initial_clip) for e in pileup_region.vector])
# Std-based clipping
mean = raw_signal.mean()
std = raw_signal.std()
clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]
# Bias Correction
correctedSignal = bias_correction(bam, clip_signal, biasTableF, biasTableR, genomeFileName, ll[0], p1, p2)
# Summing min value to signal
stdzSignal = [e+minValue for e in correctedSignal]
示例9: __init__
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
class GenomicSignal:
"""
Represents a genomic signal. It should be used to fetch normalized and slope
signals from a bam file.
Usage:
1. Initialize class.
2. Call load_sg_coefs once.
3. Call get_signal as many times as needed.
Authors: Eduardo G. Gusmao.
"""
def __init__(self, file_name=None):
"""
Initializes GenomicSignal.
"""
self.file_name = file_name
self.sg_coefs = None
if file_name is not None:
self.bam = Samfile(file_name, "rb")
def load_sg_coefs(self, slope_window_size):
"""
Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.
Keyword arguments:
slope_window_size -- Window size of Savitzky-Golay coefficients.
Return:
None -- It updates self.sg_coefs.
"""
self.sg_coefs = self.savitzky_golay_coefficients(slope_window_size, 2, 1)
def get_tag_count(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
initial_clip=1000):
"""
Gets the tag count associated with self.bam based on start, end and ext.
Keyword arguments:
ref -- Chromosome name.
start -- Initial genomic coordinate of signal.
end -- Final genomic coordinate of signal.
downstream_ext -- Number of bps to extend towards the downstream region (right for forward strand and left for reverse strand).
upstream_ext -- Number of bps to extend towards the upstream region (left for forward strand and right for reverse strand).
forward_shift -- Number of bps to shift the reads aligned to the forward strand. Can be a positive number for a shift towards the downstream region (towards the inside of the aligned read) and a negative number for a shift towards the upstream region.
reverse_shift -- Number of bps to shift the reads aligned to the reverse strand. Can be a positive number for a shift towards the upstream region and a negative number for a shift towards the downstream region (towards the inside of the aligned read).
initial_clip -- Signal will be initially clipped at this level to avoid outliers.
Return:
tag_count -- Total signal.
"""
# Fetch raw signal
pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
if ps_version == "0.7.5":
self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
else:
iter = self.bam.fetch(reference=ref, start=start, end=end)
for alignment in iter: pileup_region.__call__(alignment)
raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])
# Tag count
try:
tag_count = sum(raw_signal)
except Exception:
tag_count = 0
return tag_count
def get_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
initial_clip=1000, per_norm=98, per_slope=98,
bias_table=None, genome_file_name=None, print_raw_signal=False):
"""
Gets the signal associated with self.bam based on start, end and ext.
initial_clip, per_norm and per_slope are used as normalization factors during the normalization
and slope evaluation procedures.
Keyword arguments:
ref -- Chromosome name.
start -- Initial genomic coordinate of signal.
end -- Final genomic coordinate of signal.
initial_clip -- Signal will be initially clipped at this level to avoid outliers.
per_norm -- Percentile value for 'hon_norm' function of the normalized signal.
per_slope -- Percentile value for 'hon_norm' function of the slope signal.
bias_table -- Bias table to perform bias correction.
genome_file_name -- Genome to perform bias correction.
downstream_ext -- Number of bps to extend towards the downstream region
(right for forward strand and left for reverse strand).
upstream_ext -- Number of bps to extend towards the upstream region
(left for forward strand and right for reverse strand).
forward_shift -- Number of bps to shift the reads aligned to the forward strand.
Can be a positive number for a shift towards the downstream region
(towards the inside of the aligned read) and a negative number for a shift towards the upstream region.
reverse_shift -- Number of bps to shift the reads aligned to the reverse strand.
Can be a positive number for a shift towards the upstream region and a negative number
for a shift towards the downstream region (towards the inside of the aligned read).
Return:
hon_signal -- Normalized signal.
slopehon_signal -- Slope signal.
#.........这里部分代码省略.........
示例10: defaultdict
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
if(not os.path.exists(args.outdir)):
os.makedirs(args.outdir)
#________________________________________________________________________________________________________________
#get error dict;
#________________________________________________________________________________________________________________
errors = defaultdict(set)
with open(args.errors, 'r') as f:
for l in f:
a = l.strip().split("\t");
if(not args.etype or ",".join(a[:2]) in args.etype):
errors[tuple(a[:2])].add(a[2]);
errors2segments = defaultdict(lambda: defaultdict(list));
samfile = Samfile(args.path)
for segment in samfile.fetch(until_eof=True):
num = segment.query_name.split("|")[0]
for etype, eset in errors.iteritems():
if(num in eset):
errors2segments[etype][num].append(segment);
break;
additional = defaultdict(list);
for fname in args.additional:
tsamfile = Samfile(fname);
for segment in tsamfile.fetch(until_eof=True):
num = segment.query_name.split("|")[0]
additional[num].append(ArWrapper(segment, tsamfile.getrname(segment.tid)))
tsamfile.close();
示例11: estimate_table
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, shift):
"""
Estimates bias based on HS regions, DNase-seq signal and genomic sequences.
Keyword arguments:
regions -- DNase-seq HS regions.
dnase_file_name -- DNase-seq file name.
genome_file_name -- Genome to fetch genomic sequences from.
Return:
bias_table_F, bias_table_R -- Bias tables.
"""
# Parameters
maxDuplicates = 100
pseudocount = 1.0
# Initializing bam and fasta
if(dnase_file_name.split(".")[-1].upper() != "BAM"): return None # TODO ERROR
bamFile = Samfile(dnase_file_name, "rb")
fastaFile = Fastafile(genome_file_name)
# Initializing dictionaries
obsDictF = dict(); obsDictR = dict()
expDictF = dict(); expDictR = dict()
ct_reads_r=0
ct_reads_f=0
ct_kmers=0
# Iterating on HS regions
for region in regions:
# Initialization
prevPos = -1
trueCounter = 0
# Evaluating observed frequencies ####################################
# Fetching reads
for r in bamFile.fetch(region.chrom, region.initial, region.final):
# Calculating positions
if(not r.is_reverse): p1 = r.pos - (k_nb/2) - 1 + shift
else: p1 = r.aend - (k_nb/2) + 1 - shift
p2 = p1 + k_nb
# Verifying PCR artifacts
if(p1 == prevPos): trueCounter += 1
else:
prevPos = p1
trueCounter = 0
if(trueCounter > maxDuplicates): continue
# Fetching k-mer
try: currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
except Exception: continue
if(r.is_reverse): currStr = AuxiliaryFunctions.revcomp(currStr)
# Counting k-mer in dictionary
if(not r.is_reverse):
ct_reads_r+=1
try: obsDictF[currStr] += 1
except Exception: obsDictF[currStr] = 1
else:
ct_reads_f+=1
try: obsDictR[currStr] += 1
except Exception: obsDictR[currStr] = 1
# Evaluating expected frequencies ####################################
# Fetching whole sequence
try: currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
except Exception: continue
currRevComp = AuxiliaryFunctions.revcomp(currStr)
# Iterating on each sequence position
for i in range(0,len(currStr)-k_nb):
ct_kmers+=1
# Counting k-mer in dictionary
s = currStr[i:i+k_nb]
try: expDictF[s] += 1
except Exception: expDictF[s] = 1
# Counting k-mer in dictionary for reverse complement
s = currRevComp[i:i+k_nb]
try: expDictR[s] += 1
except Exception: expDictR[s] = 1
# Closing files
bamFile.close()
fastaFile.close()
# Creating bias dictionary
alphabet = ["A","C","G","T"]
kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
bias_table_F = dict([(e,0.0) for e in kmerComb])
bias_table_R = dict([(e,0.0) for e in kmerComb])
for kmer in kmerComb:
#.........这里部分代码省略.........
示例12: estimate_bias_kmer
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def estimate_bias_kmer(args):
# Parameters
maxDuplicates = 100
pseudocount = 1.0
# Initializing bam and fasta
bamFile = Samfile(args.reads_file, "rb")
genome_data = GenomeData(args.organism)
fastaFile = Fastafile(genome_data.get_genome())
regions = GenomicRegionSet("regions")
regions.read(args.regions_file)
# Initializing dictionaries
obsDictF = dict()
obsDictR = dict()
expDictF = dict()
expDictR = dict()
ct_reads_r = 0
ct_reads_f = 0
ct_kmers = 0
# Iterating on HS regions
for region in regions:
# Initialization
prevPos = -1
trueCounter = 0
# Evaluating observed frequencies ####################################
# Fetching reads
for r in bamFile.fetch(region.chrom, region.initial, region.final):
# Calculating positions
if not r.is_reverse:
cut_site = r.pos + args.forward_shift - 1
p1 = cut_site - int(floor(args.k_nb / 2))
else:
cut_site = r.aend + args.reverse_shift + 1
p1 = cut_site - int(floor(args.k_nb / 2))
p2 = p1 + args.k_nb
# Verifying PCR artifacts
if p1 == prevPos:
trueCounter += 1
else:
prevPos = p1
trueCounter = 0
if trueCounter > maxDuplicates: continue
# Fetching k-mer
try:
currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
except Exception:
continue
if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)
# Counting k-mer in dictionary
if not r.is_reverse:
ct_reads_f += 1
try:
obsDictF[currStr] += 1
except Exception:
obsDictF[currStr] = 1
else:
ct_reads_r += 1
try:
obsDictR[currStr] += 1
except Exception:
obsDictR[currStr] = 1
# Evaluating expected frequencies ####################################
# Fetching whole sequence
try:
currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
except Exception:
continue
currRevComp = AuxiliaryFunctions.revcomp(currStr)
# Iterating on each sequence position
for i in range(0, len(currStr) - args.k_nb):
ct_kmers += 1
# Counting k-mer in dictionary
s = currStr[i:i + args.k_nb]
try:
expDictF[s] += 1
except Exception:
expDictF[s] = 1
# Counting k-mer in dictionary for reverse complement
s = currRevComp[i:i + args.k_nb]
try:
expDictR[s] += 1
except Exception:
expDictR[s] = 1
# Closing files
bamFile.close()
fastaFile.close()
#.........这里部分代码省略.........
示例13: create_signal
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def create_signal(args, regions):
def revcomp(s):
rev_dict = dict([("A", "T"), ("T", "A"), ("C", "G"), ("G", "C"), ("N", "N")])
return "".join([rev_dict[e] for e in s[::-1]])
alphabet = ["A", "C", "G", "T"]
kmer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
f_obs_dict = dict([(e, 0.0) for e in kmer_comb])
r_obs_dict = dict([(e, 0.0) for e in kmer_comb])
f_exp_dict = dict([(e, 0.0) for e in kmer_comb])
r_exp_dict = dict([(e, 0.0) for e in kmer_comb])
bam_file = Samfile(args.reads_file, "rb")
genome_data = GenomeData(args.organism)
fasta_file = Fastafile(genome_data.get_genome())
for region in regions:
# Fetching observed reads
reads = bam_file.fetch(reference=region.chrom, start=region.initial, end=region.final)
for read in reads:
if not read.is_reverse:
p1 = read.pos - int(floor(args.k_nb / 2)) + args.forward_shift - 1
else:
p1 = read.aend - int(floor(args.k_nb / 2)) + args.reverse_shift + 1
p2 = p1 + args.k_nb
try:
dna_sequence_obs = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
except Exception:
continue
if 'N' not in dna_sequence_obs:
if read.is_reverse:
dna_sequence_obs = revcomp(dna_sequence_obs)
r_obs_dict[dna_sequence_obs] += 1
else:
f_obs_dict[dna_sequence_obs] += 1
# Fetching whole sequence
try:
dna_sequence_exp = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
except Exception:
continue
dna_sequence_exp_rev = revcomp(dna_sequence_exp)
for i in range(0, len(dna_sequence_exp) - args.k_nb):
s = dna_sequence_exp[i:i + args.k_nb]
if "N" not in s:
f_exp_dict[s] += 1
s = dna_sequence_exp_rev[i:i + args.k_nb]
if "N" not in s:
r_exp_dict[s] += 1
output_fname_f_obs = os.path.join(args.output_location, "{}_f_obs.fa".format(str(args.k_nb)))
output_fname_f_exp = os.path.join(args.output_location, "{}_f_exp.fa".format(str(args.k_nb)))
output_fname_r_obs = os.path.join(args.output_location, "{}_r_obs.fa".format(str(args.k_nb)))
output_fname_r_exp = os.path.join(args.output_location, "{}_r_exp.fa".format(str(args.k_nb)))
output_file_f_obs = open(output_fname_f_obs, "w")
output_file_f_exp = open(output_fname_f_exp, "w")
output_file_r_obs = open(output_fname_r_obs, "w")
output_file_r_exp = open(output_fname_r_exp, "w")
for kmer in r_obs_dict.keys():
if f_obs_dict[kmer] > 0:
output_file_f_obs.write(kmer + "\t" + str(f_obs_dict[kmer]) + "\n")
for kmer in r_obs_dict.keys():
if f_exp_dict[kmer] > 0:
output_file_f_exp.write(kmer + "\t" + str(f_exp_dict[kmer]) + "\n")
for kmer in r_obs_dict.keys():
if r_obs_dict[kmer] > 0:
output_file_r_obs.write(kmer + "\t" + str(r_obs_dict[kmer]) + "\n")
for kmer in r_obs_dict.keys():
if r_exp_dict[kmer] > 0:
output_file_r_exp.write(kmer + "\t" + str(r_exp_dict[kmer]) + "\n")
output_file_f_obs.close()
output_file_f_exp.close()
output_file_r_obs.close()
output_file_r_exp.close()
示例14: estimate_bias_pwm
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def estimate_bias_pwm(args):
# Parameters
max_duplicates = 100
# Initializing bam and fasta
bamFile = Samfile(args.reads_file, "rb")
genome_data = GenomeData(args.organism)
fastaFile = Fastafile(genome_data.get_genome())
regions = GenomicRegionSet("regions")
regions.read(args.regions_file)
obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
# Iterating on HS regions
for region in regions:
# Initialization
prev_pos = -1
true_counter = 0
# Evaluating observed frequencies
# Fetching reads
for r in bamFile.fetch(region.chrom, region.initial, region.final):
# Calculating positions
if not r.is_reverse:
cut_site = r.pos + args.forward_shift - 1
p1 = cut_site - int(floor(args.k_nb / 2))
else:
cut_site = r.aend + args.reverse_shift + 1
p1 = cut_site - int(floor(args.k_nb / 2))
p2 = p1 + args.k_nb
# Verifying PCR artifacts
if p1 == prev_pos:
true_counter += 1
else:
prev_pos = p1
true_counter = 0
if true_counter > max_duplicates: continue
# Fetching k-mer
try:
currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
except Exception:
continue
if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)
# Counting k-mer in dictionary
if not r.is_reverse:
for i in range(0, len(currStr)):
obs_f_pwm_dict[currStr[i]][i] += 1
else:
for i in range(0, len(currStr)):
obs_r_pwm_dict[currStr[i]][i] += 1
# Evaluating expected frequencies
# Fetching whole sequence
try:
currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
except Exception:
continue
# Iterating on each sequence position
s = None
for i in range(0, len(currStr) - args.k_nb):
# Counting k-mer in dictionary
s = currStr[i:i + args.k_nb]
for i in range(0, len(s)):
exp_f_pwm_dict[s[i]][i] += 1
# Counting k-mer in dictionary for reverse complement
s = AuxiliaryFunctions.revcomp(s)
for i in range(0, len(s)):
exp_r_pwm_dict[s[i]][i] += 1
# Closing files
bamFile.close()
fastaFile.close()
# Output pwms
os.system("mkdir -p " + os.path.join(args.output_location, "pfm"))
pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
pwm_file_list = []
pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb)))
pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb)))
pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb)))
pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb)))
pwm_file_list.append(pwm_obs_f)
pwm_file_list.append(pwm_obs_r)
pwm_file_list.append(pwm_exp_f)
pwm_file_list.append(pwm_exp_r)
for i in range(len(pwm_dict_list)):
#.........这里部分代码省略.........
示例15: load_sg_coefs
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
class GenomicSignal:
"""
Represents a genomic signal. It should be used to fetch normalized and slope
signals from a bam or bw file.
Usage:
1. Initialize class.
2. Call load_sg_coefs once.
3. Call get_signal as many times as needed.
Authors: Eduardo G. Gusmao.
Methods:
load_sg_coefs(self, slope_window_size):
Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.
get_signal(self, ref, start, end, ext, initial_clip = 1000, per_norm = 98, per_slope = 98)
Gets the signal associated with self.bam or self.bw based on start, end and ext.
initial_clip, per_norm and per_slope are used as normalization factors during the normalization
and slope evaluation procedures.
hon_norm(self, sequence, mean, std):
Normalizes a sequence according to hon's criterion using mean and std.
This represents a between-dataset normalization.
boyle_norm(self, sequence):
Normalizes a sequence according to Boyle's criterion.
This represents a within-dataset normalization.
savitzky_golay_coefficients(self, window_size, order, deriv):
Evaluate the Savitzky-Golay coefficients in order to evaluate the slope of the signal.
It uses a window_size (of the interpolation), order (of the polynomial), deriv (derivative needed).
slope(self, sequence, sg_coefs):
Evaluates the slope of sequence given the sg_coefs loaded.
"""
def __init__(self, file_name):
"""
Initializes GenomicSignal.
"""
self.file_name = file_name
self.bam = None
self.bw = None
self.sg_coefs = None
self.is_bam = False
self.is_bw = False
if(self.file_name.split(".")[-1].upper() == "BAM"):
self.is_bam = True
self.bam = Samfile(file_name,"rb")
elif(self.file_name.split(".")[-1].upper() == "BW" or self.file_name.split(".")[-1].upper() == "BIGWIG"):
self.is_bw = True
self.bw = BigWigFile(file_name)
else: pass # TODO ERROR
def load_sg_coefs(self, slope_window_size):
"""
Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.
Keyword arguments:
slope_window_size -- Window size of Savitzky-Golay coefficients.
Return:
None -- It updates self.sg_coefs.
"""
self.sg_coefs = self.savitzky_golay_coefficients(slope_window_size, 2, 1)
def get_tag_count(self, ref, start, end, ext, initial_clip = 1000, ext_both_directions=False):
"""
Gets the tag count associated with self.bam based on start, end and ext.
Keyword arguments:
ref -- Chromosome name.
start -- Initial genomic coordinate of signal.
end -- Final genomic coordinate of signal.
ext -- Fragment extention. Eg. 1 for DNase and 200 for histone modifications.
initial_clip -- Signal will be initially clipped at this level to avoid outliers.
Return:
tag_count -- Total signal.
"""
# Fetch raw signal
pileup_region = PileupRegion(start,end,ext)
if(self.is_bam):
if(ps_version == "0.7.5"):
self.bam.fetch(reference=ref, start=start, end=end, callback = pileup_region)
else:
iter = self.bam.fetch(reference=ref, start=start, end=end)
if(not ext_both_directions):
for alignment in iter: pileup_region.__call__(alignment)
else:
for alignment in iter: pileup_region.__call2__(alignment)
raw_signal = array([min(e,initial_clip) for e in pileup_region.vector])
elif(self.is_bw):
signal = self.bw.pileup(ref, start, end)
raw_signal = array([min(e,initial_clip) for e in signal])
# Std-based clipping
mean = raw_signal.mean()
#.........这里部分代码省略.........