本文整理汇总了Python中pyfastaq.utils.open_file_write函数的典型用法代码示例。如果您正苦于以下问题：Python open_file_write函数的具体用法？Python open_file_write怎么用？Python open_file_write使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了open_file_write函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: filter
def filter(
    infile,
    outfile,
    minlength=0,
    maxlength=float('inf'),
    regex=None,
    ids_file=None,
    invert=False,
    mate_in=None,
    mate_out=None,
    both_mates_pass=True,
):
    '''Writes to outfile the sequences of infile that pass the given filters.

    A sequence passes when its length is in [minlength, maxlength], its id
    matches regex (if regex is given) and its id is listed in ids_file (if
    ids_file is given; one id per line, trailing whitespace stripped).

    If mate_in is given, the two inputs are read in lockstep and mate_out
    must also be given. A pair is wanted when both mates pass, or when at
    least one passes and both_mates_pass is False. Wanted pairs are written
    to outfile and mate_out respectively.

    invert=True writes exactly the sequences (or pairs) that would otherwise
    be dropped.

    Raises Error if mate_in is given without mate_out, or if a mate cannot be
    read for a sequence of infile.
    '''
    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        try:
            for line in f:
                ids_from_file.add(line.rstrip())
        finally:
            utils.close(f)

    if mate_in and mate_out is None:
        raise Error('Error in filter! mate_in provided. Must also provide mate_out')

    f_out = None
    f_out_mate = None

    # Every output handle that was opened is closed in the finally clause,
    # whichever way the function is left.
    try:
        if mate_in:
            seq_reader_mate = sequences.file_reader(mate_in)
            f_out_mate = utils.open_file_write(mate_out)

        seq_reader = sequences.file_reader(infile)
        f_out = utils.open_file_write(outfile)

        r = re.compile(regex) if regex is not None else None

        def passes(seq):
            return minlength <= len(seq) <= maxlength \
                and (r is None or r.search(seq.id) is not None) \
                and (ids_file is None or seq.id in ids_from_file)

        for seq in seq_reader:
            seq_passes = passes(seq)

            if mate_in:
                try:
                    seq_mate = next(seq_reader_mate)
                except Exception:
                    # Not a bare except: KeyboardInterrupt and SystemExit
                    # must not be reported as a missing mate.
                    raise Error('Error getting mate for sequence', seq.id, ' ... cannot continue')

                mate_passes = passes(seq_mate)
                want_the_pair = (seq_passes and mate_passes) \
                    or ((seq_passes or mate_passes) and not both_mates_pass)

                if want_the_pair != invert:
                    print(seq, file=f_out)
                    print(seq_mate, file=f_out_mate)
            elif seq_passes != invert:
                print(seq, file=f_out)
    finally:
        for handle in (f_out, f_out_mate):
            if handle is not None:
                utils.close(handle)
示例2: split_by_fixed_size
def split_by_fixed_size(infile, outfiles_prefix, chunk_size, tolerance, skip_if_all_Ns=False):
    '''Splits fasta/q file into separate files, with up to (chunk_size + tolerance) bases in each file

    Output files are named outfiles_prefix.1, outfiles_prefix.2, ...
    Sequences longer than chunk_size + tolerance are cut into chunks, one
    file per chunk, and each written chunk is recorded as a tab-separated
    line (chunk id, original id, offset) in outfiles_prefix.coords.
    Sequences shorter than chunk_size are batched together into files at
    the end. If skip_if_all_Ns is True, sequences and chunks for which
    is_all_Ns() is true are not written.
    '''
    file_count = 1
    small_sequences = []  # sequences shorter than chunk_size
    seq_reader = sequences.file_reader(infile)
    f_coords = utils.open_file_write(outfiles_prefix + '.coords')

    try:
        for seq in seq_reader:
            if skip_if_all_Ns and seq.is_all_Ns():
                continue

            if len(seq) < chunk_size:
                small_sequences.append(copy.copy(seq))
            elif len(seq) <= chunk_size + tolerance:
                f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                print(seq, file=f)
                utils.close(f)
                file_count += 1
            else:
                # make list of chunk coords (0-based start, exclusive end)
                chunks = [(x, x + chunk_size) for x in range(0, len(seq), chunk_size)]

                # the last chunk must never extend past the end of the sequence
                if chunks[-1][1] > len(seq):
                    chunks[-1] = (chunks[-1][0], len(seq))

                # a short final chunk is merged into the previous one
                if len(chunks) > 1 and (chunks[-1][1] - chunks[-1][0]) <= tolerance:
                    chunks[-2] = (chunks[-2][0], chunks[-1][1])
                    chunks.pop()

                # write one output file per chunk
                offset = 0
                for chunk in chunks:
                    if not (skip_if_all_Ns and seq.is_all_Ns(start=chunk[0], end=chunk[1] - 1)):
                        f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                        chunk_id = seq.id + ':' + str(chunk[0] + 1) + '-' + str(chunk[1])
                        print(sequences.Fasta(chunk_id, seq[chunk[0]:chunk[1]]), file=f)
                        print(chunk_id, seq.id, offset, sep='\t', file=f_coords)
                        utils.close(f)
                        file_count += 1

                    # advance even for skipped chunks, so that offset is always
                    # the chunk's start position in the original sequence
                    offset += chunk[1] - chunk[0]
    finally:
        utils.close(f_coords)

    # write files of small sequences
    if small_sequences:
        f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
        file_count += 1
        base_count = 0

        for seq in small_sequences:
            # start a new file rather than exceed the per-file limit
            if base_count > 0 and base_count + len(seq) > chunk_size + tolerance:
                utils.close(f)
                f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                file_count += 1
                base_count = 0

            print(seq, file=f)
            base_count += len(seq)

        utils.close(f)
示例3: test_raise_exception
def test_raise_exception(self):
    '''open_file_write() and open_file_read() should raise an exception when can't do the opening'''
    missing_name = 'this_file_is_not_here_so_throw_error'
    missing_dir = 'not_a_directory'

    # (function under test, path that cannot be opened)
    failing_calls = [
        (utils.open_file_read, missing_name),
        (utils.open_file_read, missing_name + '.gz'),
        (utils.open_file_read, os.path.join(data_dir, 'utils_test_not_really_zipped.gz')),
        (utils.open_file_write, os.path.join(missing_dir, missing_name)),
        (utils.open_file_write, os.path.join(missing_dir, missing_name + '.gz')),
    ]

    for opener, path in failing_calls:
        with self.assertRaises(utils.Error):
            opener(path)
示例4: interleave
def interleave(infile_1, infile_2, outfile):
    '''Writes outfile with the sequences of infile_1 and infile_2 alternating.

    The two inputs are read in lockstep: the first sequence of infile_1, the
    first of infile_2, the second of infile_1, and so on.

    Raises Error if a mate cannot be read from infile_2 for a sequence of
    infile_1, or if infile_2 still has a sequence left over once infile_1 is
    exhausted. The output file is closed on every exit path.
    '''
    seq_reader_1 = sequences.file_reader(infile_1)
    seq_reader_2 = sequences.file_reader(infile_2)
    f_out = utils.open_file_write(outfile)

    try:
        for seq_1 in seq_reader_1:
            try:
                seq_2 = next(seq_reader_2)
            except Exception:
                # Not a bare except: KeyboardInterrupt and SystemExit must
                # not be reported as a missing mate.
                raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

            print(seq_1, file=f_out)
            print(seq_2, file=f_out)

        # infile_1 is exhausted, so infile_2 should have nothing left either
        try:
            seq_2 = next(seq_reader_2)
        except Exception:
            seq_2 = None

        if seq_2 is not None:
            raise Error('Error getting mate for sequence', seq_2.id, ' ... cannot continue')
    finally:
        utils.close(f_out)
示例5: trim_contigs
def trim_contigs(infile, outfile, trim):
    '''Writes the sequences of infile to outfile after trimming their ends and widening their gaps.

    Sequences shorter than 2 * trim are dropped. For each remaining sequence,
    up to trim bases on either side of every gap reported by seq.gaps() are
    replaced with N, then seq.trim(trim, trim) and seq.trim_Ns() are applied.
    The result is written only if it still contains a character other than
    n/N. The output file is closed on every exit path.
    '''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    # loop-invariant, so compiled once rather than once per sequence
    non_n_regex = re.compile('[^nN]')

    try:
        for seq in seq_reader:
            if len(seq) < 2 * trim:
                continue

            gaps = seq.gaps()
            bases = list(seq.seq)

            # extend the length of each gap
            for gap in gaps:
                left_start = max(gap.start - trim, 0)
                right_end = min(gap.end + trim + 1, len(seq))

                for i in range(left_start, gap.start):
                    bases[i] = 'N'

                for i in range(gap.end, right_end):
                    bases[i] = 'N'

            seq.seq = ''.join(bases)

            # trim start/end bases and tidy up any resulting Ns at either end of the trimmed seq
            seq.trim(trim, trim)
            seq.trim_Ns()

            # check that there is some non-N sequence left over
            if non_n_regex.search(seq.seq) is not None:
                print(seq, file=fout)
    finally:
        utils.close(fout)
示例6: run
def run(description):
    '''Command-line entry point for "fastaq to_random_subset".

    Parses sys.argv, then writes each read of infile to outfile with
    probability percent / 100. If --mate_file is given, one mate is read for
    every read of infile, and each kept read is followed by its mate in the
    output. Exits with status 1 if the mate file runs out of reads first.

    NOTE: the description argument is not used in the body; the parser
    carries its own description text.
    '''
    parser = argparse.ArgumentParser(
        description = 'Takes a random subset of reads from a sequence file and optionally the corresponding read ' +
                      'from a mates file. Output is interleaved if mates file given',
        usage = 'fastaq to_random_subset [options] <infile> <outfile> <percent>')
    parser.add_argument('--mate_file', help='Name of mates file')
    parser.add_argument('--seed', help='Seed for random number generator. If not given, python\'s default is used', metavar='INT')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output file')
    parser.add_argument('percent', type=float, help='Per cent probability of keeping any given read (pair) in [0,100]', metavar='FLOAT')
    options = parser.parse_args()

    random.seed(a=options.seed)
    seq_reader = sequences.file_reader(options.infile)
    fout = utils.open_file_write(options.outfile)

    # The finally clause also runs for the SystemExit raised by sys.exit()
    # below, so the output is closed before the process exits.
    try:
        if options.mate_file:
            mate_seq_reader = sequences.file_reader(options.mate_file)

        for seq in seq_reader:
            if options.mate_file:
                try:
                    mate_seq = next(mate_seq_reader)
                except StopIteration:
                    print('Error! Didn\'t get mate for read', seq.id, file=sys.stderr)
                    sys.exit(1)

            # one random draw per read (pair), made after the mate is fetched
            if 100 * random.random() <= options.percent:
                print(seq, file=fout)
                if options.mate_file:
                    print(mate_seq, file=fout)
    finally:
        utils.close(fout)
示例7: to_fasta
def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False, check_unique=False):
    '''Writes the sequences of infile to outfile in FASTA format.

    sequences.Fasta.line_length is set to line_length while writing and
    restored afterwards, even if an exception occurs. Fastq records are
    converted to Fasta records built from their id and seq. If
    strip_after_first_whitespace is True, seq.strip_after_first_whitespace()
    is called on every record first.

    If check_unique is True, every name seen more than once is reported on
    stderr after the output has been written, and Error is raised.
    '''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    original_line_length = sequences.Fasta.line_length
    used_names = {}

    try:
        sequences.Fasta.line_length = line_length

        for seq in seq_reader:
            if strip_after_first_whitespace:
                seq.strip_after_first_whitespace()

            if check_unique:
                used_names[seq.id] = used_names.get(seq.id, 0) + 1

            if isinstance(seq, sequences.Fastq):
                print(sequences.Fasta(seq.id, seq.seq), file=f_out)
            else:
                print(seq, file=f_out)
    finally:
        # line_length is a class attribute, so it has to be put back
        # however this function is left
        sequences.Fasta.line_length = original_line_length
        utils.close(f_out)

    if check_unique:
        all_unique = True

        for name, count in used_names.items():
            if count > 1:
                print('Sequence name "' + name + '" not unique. Found', count, 'times', file=sys.stderr)
                all_unique = False

        if not all_unique:
            raise Error('Not all sequence names unique. Cannot continue')
示例8: acgtn_only
def acgtn_only(infile, outfile):
    '''Replace every non-acgtn (case insensitive) character with an N

    Each sequence of infile has replace_non_acgt() applied and is then
    written to outfile. The output file is closed on every exit path.
    '''
    f = utils.open_file_write(outfile)

    try:
        for seq in sequences.file_reader(infile):
            seq.replace_non_acgt()
            print(seq, file=f)
    finally:
        utils.close(f)
示例9: interleave
def interleave(infile_1, infile_2, outfile, suffix1=None, suffix2=None):
    '''Makes interleaved file from two sequence files. If used, will append suffix1 onto end
    of every sequence name in infile_1, unless it already ends with suffix1. Similar for suffix2.

    Raises Error if a mate cannot be read from infile_2 for a sequence of
    infile_1, or if infile_2 still has a sequence left over once infile_1 is
    exhausted. The output file is closed on every exit path.
    '''
    seq_reader_1 = sequences.file_reader(infile_1)
    seq_reader_2 = sequences.file_reader(infile_2)
    f_out = utils.open_file_write(outfile)

    try:
        for seq_1 in seq_reader_1:
            try:
                seq_2 = next(seq_reader_2)
            except Exception:
                # Not a bare except: KeyboardInterrupt and SystemExit must
                # not be reported as a missing mate.
                raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

            if suffix1 is not None and not seq_1.id.endswith(suffix1):
                seq_1.id += suffix1
            if suffix2 is not None and not seq_2.id.endswith(suffix2):
                seq_2.id += suffix2

            print(seq_1, file=f_out)
            print(seq_2, file=f_out)

        # infile_1 is exhausted, so infile_2 should have nothing left either
        try:
            seq_2 = next(seq_reader_2)
        except Exception:
            seq_2 = None

        if seq_2 is not None:
            raise Error('Error getting mate for sequence', seq_2.id, ' ... cannot continue')
    finally:
        utils.close(f_out)
示例10: fix_blast_coords
def fix_blast_coords(blast_file, coords_file, outfile):
    '''Rewrites a tabulated blast file, mapping chunk names and coordinates back to the original sequences.

    coords_file is loaded with offset_coords_file_to_dict(). For every hit
    whose first column is a key of that dict, the offset (element 1 of the
    dict value) is added to columns 7 and 8, and the first column is replaced
    with element 0 of the dict value. Lines without a tab are dropped. All
    other lines are rewritten tab-delimited. Both files are closed on every
    exit path.
    '''
    coords_offset = offset_coords_file_to_dict(coords_file)
    fin = utils.open_file_read(blast_file)

    try:
        fout = utils.open_file_write(outfile)

        try:
            for line in fin:
                # blastn sticks a bunch of header lines in the tabulated
                # output file. Need to ignore them
                if '\t' not in line:
                    continue

                # Lines are supposed to be tab delimited. Sometimes they
                # have a space character following a tab character, so
                # split on whitespace. This is OK because the pipeline has already
                # removed whitespace from sequence names
                data = line.rstrip().split()

                # a whitespace-only line has no fields to fix or to write
                if not data:
                    continue

                if data[0] in coords_offset:
                    data[6] = str(int(data[6]) + coords_offset[data[0]][1])
                    data[7] = str(int(data[7]) + coords_offset[data[0]][1])
                    data[0] = coords_offset[data[0]][0]

                # always reconstruct the line, because of spaces bug mentioned above
                line = '\t'.join(data)

                print(line.rstrip(), file=fout)
        finally:
            utils.close(fout)
    finally:
        utils.close(fin)
示例11: translate
def translate(infile, outfile, frame=0):
    '''Writes to outfile the result of seq.translate(frame=frame) for every sequence of infile.

    The output file is closed on every exit path.
    '''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    try:
        for seq in seq_reader:
            print(seq.translate(frame=frame), file=fout)
    finally:
        utils.close(fout)
示例12: reverse_complement
def reverse_complement(infile, outfile):
    '''Writes every sequence of infile to outfile after calling revcomp() on it.

    The output file is closed on every exit path.
    '''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    try:
        for seq in seq_reader:
            seq.revcomp()
            print(seq, file=fout)
    finally:
        utils.close(fout)
示例13: replace_bases
def replace_bases(infile, outfile, old, new):
    '''Writes every sequence of infile to outfile after calling replace_bases(old, new) on it.

    The output file is closed on every exit path.
    '''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)

    try:
        for seq in seq_reader:
            seq.replace_bases(old, new)
            print(seq, file=f_out)
    finally:
        utils.close(f_out)
示例14: strip_illumina_suffix
def strip_illumina_suffix(infile, outfile):
    '''Writes every sequence of infile to outfile after calling strip_illumina_suffix() on it.

    The output file is closed on every exit path.
    '''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)

    try:
        for seq in seq_reader:
            seq.strip_illumina_suffix()
            print(seq, file=f_out)
    finally:
        utils.close(f_out)
示例15: capillary_to_pairs
def capillary_to_pairs(infile, outprefix):
    '''Splits capillary reads into outprefix.paired.gz and outprefix.unpaired.gz.

    Each read is classified with seq.split_capillary_id(), which is used here
    as a dict with 'dir' and 'prefix' keys. Forward reads are renamed
    prefix/1, reverse reads prefix/2, and all others just prefix. Where
    several reads share a direction and prefix, only the longest is kept.

    Prefixes with both a forward and a reverse read are written to the paired
    file, forward then reverse. Every other read goes to the unpaired file.
    Both files are closed on every exit path.
    '''
    # hash the sequences, only taking longest where an end has been sequenced more than once
    seq_reader = sequences.file_reader(infile)
    fwd_seqs = {}
    rev_seqs = {}
    unpaired_seqs = {}

    for seq in seq_reader:
        id_info = seq.split_capillary_id()
        if id_info['dir'] == 'fwd':
            seq.id = id_info['prefix'] + '/1'
            h = fwd_seqs
        elif id_info['dir'] == 'rev':
            seq.id = id_info['prefix'] + '/2'
            h = rev_seqs
        else:
            seq.id = id_info['prefix']
            h = unpaired_seqs

        key = id_info['prefix']

        if key not in h or len(h[key]) < len(seq):
            h[key] = copy.copy(seq)

    # write the output files
    f_pe = None
    f_up = None

    try:
        f_pe = utils.open_file_write(outprefix + '.paired.gz')
        f_up = utils.open_file_write(outprefix + '.unpaired.gz')

        for prefix in fwd_seqs:
            if prefix in rev_seqs:
                print(fwd_seqs[prefix], file=f_pe)
                print(rev_seqs[prefix], file=f_pe)
                # whatever is left in rev_seqs afterwards has no forward mate
                del rev_seqs[prefix]
            else:
                print(fwd_seqs[prefix], file=f_up)

        for seq in rev_seqs.values():
            print(seq, file=f_up)

        for seq in unpaired_seqs.values():
            print(seq, file=f_up)
    finally:
        for handle in (f_pe, f_up):
            if handle is not None:
                utils.close(handle)