本文整理汇总了Python中pysam.Fastafile方法的典型用法代码示例。如果您正苦于以下问题:Python pysam.Fastafile方法的具体用法?Python pysam.Fastafile怎么用?Python pysam.Fastafile使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pysam
的用法示例。
在下文中一共展示了pysam.Fastafile方法的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: filter_seqlets
# 需要导入模块: import pysam [as 别名]
# 或者: from pysam import Fastafile [as 别名]
def filter_seqlets(seqlet_acts, seqlet_intervals, genome_fasta_file, end_distance=100, verbose=True):
""" Filter seqlets by valid chromosome coordinates. """
# read chromosome lengths
chr_lengths = {}
genome_fasta_open = pysam.Fastafile(genome_fasta_file)
for chrom in genome_fasta_open.references:
chr_lengths[chrom] = genome_fasta_open.get_reference_length(chrom)
genome_fasta_open.close()
# check coordinates
filter_mask = np.zeros(len(seqlet_intervals), dtype='bool')
for si, seq_int in enumerate(seqlet_intervals):
left_valid = (seq_int.start > end_distance)
right_valid = (seq_int.end + end_distance < chr_lengths[seq_int.chr])
filter_mask[si] = left_valid and right_valid
if verbose:
print('Removing %d seqlets near chromosome ends.' % (len(seqlet_intervals) - filter_mask.sum()))
# filter
seqlet_acts = seqlet_acts[filter_mask]
seqlet_intervals = [seq_int for si, seq_int in enumerate(seqlet_intervals) if filter_mask[si]]
return seqlet_acts, seqlet_intervals
示例2: load_chromosomes
# 需要导入模块: import pysam [as 别名]
# 或者: from pysam import Fastafile [as 别名]
def load_chromosomes(genome_file):
""" Load genome segments from either a FASTA file or
chromosome length table. """
# is genome_file FASTA or (chrom,start,end) table?
file_fasta = (open(genome_file).readline()[0] == '>')
chrom_segments = {}
if file_fasta:
fasta_open = pysam.Fastafile(genome_file)
for i in range(len(fasta_open.references)):
chrom_segments[fasta_open.references[i]] = [(0, fasta_open.lengths[i])]
fasta_open.close()
else:
for line in open(genome_file):
a = line.split()
chrom_segments[a[0]] = [(0, int(a[1]))]
return chrom_segments
示例3: test_seqs
# 需要导入模块: import pysam [as 别名]
# 或者: from pysam import Fastafile [as 别名]
def test_seqs(self):
"""Test that the one hot coded sequences match."""
# read sequence coordinates
seqs_bed_file = '%s/sequences.bed' % self.out_dir
seq_coords = read_seq_coords(seqs_bed_file)
# read one hot coding from TF Records
train_tfrs_str = '%s/tfrecords/train-0.tfr' % self.out_dir
seqs_1hot, _ = self.read_tfrecords(train_tfrs_str)
# open FASTA
fasta_open = pysam.Fastafile(self.fasta_file)
# check random sequences
seq_indexes = random.sample(range(seqs_1hot.shape[0]), 32)
for si in seq_indexes:
sc = seq_coords[si]
seq_fasta = fasta_open.fetch(sc.chr, sc.start, sc.end)
seq_1hot_dna = hot1_dna(seqs_1hot[si])
self.assertEqual(seq_fasta, seq_1hot_dna)
示例4: match_database
# 需要导入模块: import pysam [as 别名]
# 或者: from pysam import Fastafile [as 别名]
def match_database(args):
"""Match a genome to a database of alleles."""
refs = Fastafile(expanduser(args.reference))
db = VariantFile(expanduser(args.database))
sample = VariantFile(expanduser(args.sample))
format_meta, info_meta = build_new_metadata(db, sample)
with VariantFile(args.output, 'w', header=sample.header) as out:
for superlocus, matches in generate_matches(refs, sample, db, args):
for allele_locus, allele, match in matches:
# Annotate results of search
status, times = translate_match(match)
suffix = '_' + status
for locus in allele_locus:
annotate_info(locus, allele, info_meta, suffix, times)
annotate_format(locus, allele, format_meta, suffix, times)
for locus in sorted(superlocus, key=NormalizedLocus.record_order_key):
out.write(locus.record)
示例5: sufficient_sequence
# 需要导入模块: import pysam [as 别名]
# 或者: from pysam import Fastafile [as 别名]
def sufficient_sequence(fasta_file, genes_df, seq_length, n_allowed_pct):
"""Return boolean mask specifying genes with sufficient sequence."""
# open FASTA
fasta_open = pysam.Fastafile(fasta_file)
# initialize gene boolean
gene_valid = np.ones(genes_df.shape[0], dtype='bool')
gi = 0
for gene in genes_df.itertuples():
chr_len = fasta_open.get_reference_length(gene.chr)
mid_pos = (gene.start + gene.end) // 2
seq_start = mid_pos - seq_length//2
seq_end = seq_start + seq_length
# count requested N's
n_requested = 0
if seq_start < 0:
n_requested += -seq_start
if seq_end > chr_len:
n_requested += seq_end - chr_len
if n_requested > 0:
if n_requested/seq_length < n_allowed_pct:
print('Allowing %s with %d Ns' % (gene.Index, n_requested))
else:
print('Skipping %s with %d Ns' % (gene.Index, n_requested))
gene_valid[gi] = False
gi += 1
fasta_open.close()
return gene_valid
示例6: test_seqs
# 需要导入模块: import pysam [as 别名]
# 或者: from pysam import Fastafile [as 别名]
def test_seqs(self):
"""Test that the one hot coded sequences match."""
for gi in range(2):
# read sequence coordinates
seqs_bed_file = '%s/sequences%d.bed' % (self.out_dir, gi)
seq_coords = read_seq_coords(seqs_bed_file)
# read one hot coding from TF Records
train_tfrs_str = '%s/tfrecords/train-%d-0.tfr' % (self.out_dir, gi)
seqs_1hot, _, genomes = self.read_tfrecords(train_tfrs_str)
# check genome
self.assertEqual(len(np.unique(genomes)), 1)
self.assertEqual(genomes[0], gi)
# open FASTA
fasta_open = pysam.Fastafile(self.fasta_files[gi])
# check random sequences
seq_indexes = random.sample(range(seqs_1hot.shape[0]), 32)
for si in seq_indexes:
sc = seq_coords[si]
seq_fasta = fasta_open.fetch(sc.chr, sc.start, sc.end).upper()
seq_1hot_dna = hot1_dna(seqs_1hot[si])
self.assertEqual(seq_fasta, seq_1hot_dna)
示例7: normalize
# 需要导入模块: import pysam [as 别名]
# 或者: from pysam import Fastafile [as 别名]
def normalize(args):
"""Normalize variants."""
refs = Fastafile(expanduser(args.reference))
variants = VariantFile(args.sample)
with VariantFile(args.output, 'w', header=variants.header) as out:
# Create parallel locus iterator by chromosome
for _, ref, loci in records_by_chromosome(refs, [variants], [None], args):
loci = sort_almost_sorted(loci[0], key=NormalizedLocus.left_order_key)
for locus in loci:
record = locus.record
start = locus.left.start
stop = locus.left.stop
alleles = locus.left.alleles
if '' in alleles:
pad = ref[start - 1:start]
start -= 1
alleles = [pad + a for a in alleles]
record.alleles = alleles
record.start = start
record.stop = stop
out.write(record)
示例8: convert_svtool_to_vcf
# 需要导入模块: import pysam [as 别名]
# 或者: from pysam import Fastafile [as 别名]
def convert_svtool_to_vcf(file_name, sample, out_vcf, toolname, reference, sort=False, index=False):
vcf_template_reader = get_template()
vcf_template_reader.samples = [sample]
vcf_fd = open(out_vcf, "w") if out_vcf is not None else sys.stdout
vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader)
reference_handle = pysam.Fastafile(reference) if reference else None
reference_contigs = get_contigs(reference)
if sort:
if not reference_contigs:
logger.warn("Chromosomes will be sorted in lexicographic order since reference is missing")
else:
logger.info("Chromosomes will be sorted by the reference order")
vcf_records = []
for tool_record in tool_to_reader[toolname](file_name, reference_handle=reference_handle):
vcf_record = tool_record.to_vcf_record(sample)
if vcf_record is None:
continue
if sort:
vcf_records.append(vcf_record)
else:
vcf_writer.write_record(vcf_record)
if sort:
if reference_contigs:
contigs_order_dict = {contig.name: index for (index, contig) in enumerate(reference_contigs)}
vcf_records.sort(
cmp=lambda x, y: cmp((contigs_order_dict[x.CHROM], x.POS), (contigs_order_dict[y.CHROM], y.POS)))
else:
vcf_records.sort(cmp=lambda x, y: cmp((x.CHROM, x.POS), (y.CHROM, y.POS)))
for vcf_record in vcf_records:
vcf_writer.write_record(vcf_record)
vcf_writer.close()
if out_vcf and index:
pysam.tabix_index(out_vcf, force=True, preset='vcf')
示例9: segments_1hot
# 需要导入模块: import pysam [as 别名]
# 或者: from pysam import Fastafile [as 别名]
def segments_1hot(fasta_file, segments, seq_length, stride):
""" Read and 1-hot code sequences in their segment batches.
Args
fasta_file: FASTA genome
segments: list of (chrom,start,end) genomic segments to read
seq_length: sequence length to break them into
stride: distance to advance each sequence
Returns:
seqs_1hot: You know.
seqs_segments: list of (chrom,start,end) sequence segments
"""
# open fasta
fasta = pysam.Fastafile(fasta_file)
# initialize 1-hot coding list
seqs_1hot = []
# segment corresponding to each sequence
seqs_segments = []
for chrom, seg_start, seg_end in segments:
# read sequence
seg_seq = fasta.fetch(chrom, seg_start, seg_end)
# break up into batchable sequences (as above in bigwig_batch)
bstart = 0
bend = bstart + seq_length
while bend < len(seg_seq):
# append
seqs_1hot.append(dna_io.dna_1hot(seg_seq[bstart:bend]))
seqs_segments.append((chrom, seg_start + bstart, seg_start + bend))
# update
bstart += stride
bend += stride
return np.array(seqs_1hot), seqs_segments
################################################################################
示例10: segments_1hot
# 需要导入模块: import pysam [as 别名]
# 或者: from pysam import Fastafile [as 别名]
def segments_1hot(fasta_file, segments, seq_length, stride):
""" Read and 1-hot code sequences in their segment batches.
Args
fasta_file: FASTA genome
segments: list of (chrom,start,end) genomic segments to read
seq_length: sequence length to break them into
Returns:
seqs_1hot: You know.
seqs_segments: list of (chrom,start,end) sequence segments
"""
# open fasta
fasta = pysam.Fastafile(fasta_file)
# initialize 1-hot coding list
seqs_1hot = []
# segment corresponding to each sequence
seqs_segments = []
for chrom, seg_start, seg_end in segments:
# read sequence
seg_seq = fasta.fetch(chrom, seg_start, seg_end)
# break up into batchable sequences (as above in bigwig_batch)
bstart = 0
bend = bstart + seq_length
while bend < len(seg_seq):
# append
seqs_1hot.append(dna_io.dna_1hot(seg_seq[bstart:bend]))
seqs_segments.append((chrom, seg_start + bstart, seg_start + bend))
# update
bstart += stride
bend += stride
return np.array(seqs_1hot), seqs_segments
################################################################################
示例11: __init__
# 需要导入模块: import pysam [as 别名]
# 或者: from pysam import Fastafile [as 别名]
def __init__(self,
chrom_lengths,
stranded=False,
smooth_sd=32,
clip_max=None,
clip_max_multi=2,
shift_center=False,
shift_forward=0,
shift_reverse=0,
mapq_t=1,
fasta_file=None):
self.stranded = stranded
if self.stranded:
# model + and - strand of each chromosome
self.chrom_lengths = OrderedDict()
for chrom, clen in chrom_lengths.items():
self.chrom_lengths['%s+'%chrom] = clen
for chrom, clen in chrom_lengths.items():
self.chrom_lengths['%s-'%chrom] = clen
else:
self.chrom_lengths = chrom_lengths
self.genome_length = sum(self.chrom_lengths.values())
self.unique_counts = np.zeros(self.genome_length, dtype='uint8')
self.active_blocks = None
self.smooth_sd = smooth_sd
self.shift_center = shift_center
self.shift_forward = shift_forward
self.shift_reverse = shift_reverse
self.clip_max = clip_max
self.clip_max_multi = clip_max_multi
self.adaptive_cdf = 0.01
self.adaptive_t = {}
self.mapq_t = mapq_t
self.fasta = None
if fasta_file is not None:
self.fasta = pysam.Fastafile(fasta_file)
self.gc_model = None
示例12: main
# 需要导入模块: import pysam [as 别名]
# 或者: from pysam import Fastafile [as 别名]
def main():
usage = 'usage: %prog [options] <hdf5_file> <bed_file>'
parser = OptionParser(usage)
parser.add_option(
'-f',
dest='fasta_file',
default='%s/assembly/hg19.fa' % os.environ['HG19'],
help='FASTA file [Default: %default]')
parser.add_option(
'-n',
dest='check_num',
default=100,
type='int',
help='Number of sequences to check [Default: %default]')
(options, args) = parser.parse_args()
if len(args) != 2:
parser.error('Must provide HDF5 and BED files')
else:
hdf5_file = args[0]
bed_file = args[1]
fasta = pysam.Fastafile(options.fasta_file)
hdf5_in = h5py.File(hdf5_file)
si = 0
for line in open(bed_file):
a = line.split()
if a[-1] == 'train':
chrom = a[0]
start = int(a[1])
end = int(a[2])
bed_seq = fasta.fetch(chrom, start, end).upper()
hdf5_seq = basenji.dna_io.hot1_dna(hdf5_in['train_in'][si:si + 1])[0]
print(bed_seq[:10], len(bed_seq))
assert (bed_seq == hdf5_seq)
si += 1
if si > options.check_num:
break
################################################################################
# __main__
################################################################################
示例13: match_database2
# 需要导入模块: import pysam [as 别名]
# 或者: from pysam import Fastafile [as 别名]
def match_database2(args):
"""Match a genome to a database of alleles."""
refs = Fastafile(expanduser(args.reference))
db = VariantFile(expanduser(args.database))
sample = VariantFile(expanduser(args.sample))
try:
sample_name = sample.header.samples[args.name]
except TypeError:
sample_name = args.name
if db.index is None:
raise ValueError('database file must be indexed')
if sample.index is None:
raise ValueError('sample file must be indexed')
# Open tabluar output file, if requested
table = None
if args.table:
tablefile = open(args.table, 'w') if args.table != '-' else sys.stdout
table = csv.writer(tablefile, delimiter='\t', lineterminator='\n')
write_table_header(table)
update_info_header(sample.header)
with VariantFile(args.output, 'w', header=sample.header) as out:
for superlocus, matches in generate_matches(refs, sample, db, args):
clear_info_fields(superlocus)
for allele_locus, allele, match in matches:
dbvar = allele.record
var_id = dbvar.id or f'{dbvar.chrom}_{dbvar.start+1}_{dbvar.stop}_{dbvar.alts[0]}'
status, times = translate_match(match)
for locus in allele_locus:
info = locus.record.info
info[status] = info.get(status, ()) + (var_id, ) * times
write_table_row(table, sample_name, var_id, allele_locus, status, match)
for locus in sorted(superlocus, key=NormalizedLocus.record_order_key):
out.write(locus.record)
示例14: match_replicates
# 需要导入模块: import pysam [as 别名]
# 或者: from pysam import Fastafile [as 别名]
def match_replicates(args):
"""Match a genome against another presumably identical genome (i.e. replicates)."""
refs = Fastafile(expanduser(args.reference))
in_vars = [VariantFile(var) for var in [args.vcf1, args.vcf2]]
out_vars = make_outputs(in_vars, args.out1, args.out2)
match_status_map = {True: '=', False: 'X', None: '.'}
# Create parallel locus iterator by chromosome
for chrom, ref, loci in records_by_chromosome(refs, in_vars, [args.name1, args.name2], args):
# Create superloci by taking the union of overlapping loci across all of the locus streams
loci = [sort_almost_sorted(l, key=NormalizedLocus.extreme_order_key) for l in loci]
superloci = union(loci, interval_func=attrgetter('min_start', 'max_stop'))
# Proceed by superlocus
for _, _, (super1, super2) in superloci:
super1.sort(key=NormalizedLocus.natural_order_key)
super2.sort(key=NormalizedLocus.natural_order_key)
super_start, super_stop = get_superlocus_bounds([super1, super2])
print('-' * 80)
print(f'{chrom}:[{super_start:d}-{super_stop:d}):')
print()
for i, superlocus in enumerate([super1, super2], 1):
for locus in superlocus:
lstart = locus.start
lstop = locus.stop
lref = locus.ref or '-'
indices = locus.allele_indices
sep = '|' if locus.phased else '/'
geno = sep.join(locus.alleles[a] or '-' if a is not None else '.' for a in indices)
print(f' NORM{i:d}: [{lstart:5d}-{lstop:5d}) ref={lref} geno={geno}')
print()
match, match_type = superlocus_equal(ref, super_start, super_stop, super1, super2, debug=args.debug)
match_status = match_status_map[match]
print(f' MATCH={match_status} TYPE={match_type}')
print()
write_match(out_vars[0], super1, args.name1, match_status, match_type)
write_match(out_vars[1], super2, args.name2, match_status, match_type)
for i, superlocus in enumerate([super1, super2], 1):
for locus in superlocus:
print(f' VCF{i:d}: {locus.record}', end='')
print()
for out_var in out_vars:
if out_var is not None:
out_var.close()