本文整理汇总了Python中cogent.DNA.rc方法的典型用法代码示例。如果您正苦于以下问题:Python DNA.rc方法的具体用法?Python DNA.rc怎么用?Python DNA.rc使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类cogent.DNA
的用法示例。
在下文中一共展示了DNA.rc方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: process_uclust_pw_alignment_results
# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def process_uclust_pw_alignment_results(fasta_pairs_lines,uc_lines):
""" Process results of uclust search and align """
alignments = get_next_two_fasta_records(fasta_pairs_lines)
for hit in get_next_record_type(uc_lines,'H'):
matching_strand = hit[4]
if matching_strand == '-':
strand_id = '-'
target_rev_match = True
elif matching_strand == '+':
strand_id = '+'
target_rev_match = False
elif matching_strand == '.':
# protein sequence, so no strand information
strand_id = ''
target_rev_match = False
else:
raise UclustParseError, "Unknown strand type: %s" % matching_strand
uc_query_id = hit[8]
uc_target_id = hit[9]
percent_id = float(hit[3])
fasta_pair = alignments.next()
fasta_query_id = fasta_pair[0][0]
aligned_query = fasta_pair[0][1]
if fasta_query_id != uc_query_id:
raise UclustParseError,\
"Order of fasta and uc files do not match."+\
" Got query %s but expected %s." %\
(fasta_query_id, uc_query_id)
fasta_target_id = fasta_pair[1][0]
aligned_target = fasta_pair[1][1]
if fasta_target_id != uc_target_id + strand_id:
raise UclustParseError, \
"Order of fasta and uc files do not match."+\
" Got target %s but expected %s." %\
(fasta_target_id, uc_target_id + strand_id)
if target_rev_match:
query_id = uc_query_id + ' RC'
aligned_query = DNA.rc(aligned_query)
target_id = uc_target_id
aligned_target = DNA.rc(aligned_target)
else:
query_id = uc_query_id
aligned_query = aligned_query
target_id = uc_target_id
aligned_target = aligned_target
yield (query_id, target_id, aligned_query, aligned_target, percent_id)
示例2: parse_illumina_line
# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def parse_illumina_line(l, barcode_length, rev_comp_barcode,
barcode_in_sequence=False):
"""Parses a single line of Illumina data
"""
fields = l.strip().split(':')
y_position_subfields = fields[4].split('#')
y_position = int(y_position_subfields[0])
sequence = fields[5]
qual_string = fields[6]
if barcode_in_sequence:
barcode = sequence[:barcode_length]
sequence = sequence[barcode_length:]
qual_string = qual_string[barcode_length:]
else:
barcode = y_position_subfields[1][:barcode_length]
if rev_comp_barcode:
barcode = DNA.rc(barcode)
result = {
'Full description': ':'.join(fields[:5]),
'Machine Name': fields[0],
'Channel Number': int(fields[1]),
'Tile Number': int(fields[2]),
'X Position': int(fields[3]),
'Y Position': y_position,
'Barcode': barcode,
'Full Y Position Field': fields[4],
'Sequence': sequence,
'Quality Score': qual_string}
return result
示例3: process_barcode_single_end_data
# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def process_barcode_single_end_data(read1_data, output_bc_fastq, output_fastq1, bc1_len=6, rev_comp_bc1=False):
""" Processes, writes single-end barcode data, parsed sequence
read1_data: list of header, read, quality scores
output_bc_fastq: open output fastq filepath
output_fastq1: open output fastq reads filepath
bc1_len: length of barcode to remove from beginning of data
rev_comp_bc1: reverse complement barcode before writing.
"""
header_index = 0
sequence_index = 1
quality_index = 2
bc_read = read1_data[sequence_index][:bc1_len]
bc_qual = read1_data[quality_index][:bc1_len]
if rev_comp_bc1:
bc_read = DNA.rc(bc_read)
bc_qual = bc_qual[::-1]
bc_lines = format_fastq_record(read1_data[header_index], bc_read, bc_qual)
output_bc_fastq.write(bc_lines)
seq_lines = format_fastq_record(
read1_data[header_index], read1_data[sequence_index][bc1_len:], read1_data[quality_index][bc1_len:]
)
output_fastq1.write(seq_lines)
return
示例4: parse_illumina_single_end_read_file
# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def parse_illumina_single_end_read_file(read_file,barcode_length,\
max_bad_run_length,quality_threshold,min_per_read_length,
rev_comp,rev_comp_barcode,barcode_in_seq,barcode_max_N=0,seq_max_N=0):
"""Parses Illumina single-end read file
"""
for read_line in read_file:
read = parse_illumina_line(read_line,barcode_length,
rev_comp_barcode,barcode_in_seq)
read_desc = illumina_read_description_from_read_data(read)
read_barcode = read['Barcode']
if read_barcode.count('N') > barcode_max_N:
continue
seq, qual = read_qual_score_filter(\
read['Sequence'], read['Quality Score'],\
max_bad_run_length, quality_threshold)
if (len(seq) < min_per_read_length) or (seq.count('N') > seq_max_N):
continue
if rev_comp:
seq = DNA.rc(seq)
qual = qual[::-1]
yield read_desc, read_barcode, seq, qual
示例5: rc_fasta_lines
# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def rc_fasta_lines(fasta_lines, seq_desc_mapper=append_rc):
"""
"""
for seq_id, seq in parse_fasta(fasta_lines):
seq_id = seq_desc_mapper(seq_id)
seq = DNA.rc(seq.upper())
yield seq_id, seq
return
示例6: get_rev_primer_seqs
# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def get_rev_primer_seqs(mapping_fp):
""" Parses mapping file to get dictionary of SampleID:Rev primer
mapping_fp: mapping filepath
"""
hds, mapping_data, run_description, errors, warnings = \
process_id_map(mapping_fp, has_barcodes=False,
disable_primer_check=True)
if errors:
for curr_err in errors:
if curr_err.startswith("Duplicate SampleID"):
raise ValueError,('Errors were found with mapping file, '+\
'please run check_id_map.py to identify problems.')
# create dict of dicts with SampleID:{each header:mapping data}
id_map = {}
for curr_data in mapping_data:
id_map[curr_data[0]] = {}
for header in range(len(hds)):
for curr_data in mapping_data:
id_map[curr_data[0]][hds[header]] = curr_data[header]
reverse_primers = {}
for curr_id in id_map.keys():
try:
reverse_primers[curr_id] =\
[DNA.rc(curr_rev_primer) for curr_rev_primer in\
id_map[curr_id]['ReversePrimer'].split(',')]
except KeyError:
raise KeyError,("Reverse primer not found in mapping file, "+\
"please include a 'ReversePrimer' column.")
# Check for valid reverse primers
# Will have been detected as warnings from mapping file
for curr_err in errors:
if curr_err.startswith("Invalid DNA sequence detected"):
raise ValueError,("Problems found with reverse primers, please "+\
"check mapping file with check_id_map.py")
return reverse_primers
示例7: parse_illumina_paired_end_read_files
# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def parse_illumina_paired_end_read_files(read1_file,read2_file,barcode_length,\
max_bad_run_length,quality_threshold,min_per_read_length,rev_comp_barcode,\
barcode_max_N=0,seq_max_N=0):
"""Parses Illumina paired-end read file pair
"""
for read1_line, read2_line in izip(read1_file,read2_file):
read1 = parse_illumina_line(read1_line,barcode_length,rev_comp_barcode)
read2 = parse_illumina_line(read2_line,barcode_length,rev_comp_barcode)
read1_desc = illumina_read_description_from_read_data(read1)
read2_desc = illumina_read_description_from_read_data(read2)
read1_barcode = read1['Barcode']
read2_barcode = read2['Barcode']
if (read1_barcode.count('N') > barcode_max_N) or \
(read2_barcode.count('N') > barcode_max_N):
continue
if read1_desc != read2_desc:
raise IlluminaParseError, \
"Error in sequence files, descriptions of"+\
" corresponding lines are not compatible: %s != %s" %\
(read1_desc, read2_desc)
assert read1_barcode == read2_barcode
seq1, qual1 = read_qual_score_filter(\
read1['Sequence'], read1['Quality Score'],\
max_bad_run_length, quality_threshold)
if (len(seq1) < min_per_read_length):
continue
seq2, qual2 = read_qual_score_filter(\
read2['Sequence'], read2['Quality Score'],\
max_bad_run_length, quality_threshold)
if (len(seq2) < min_per_read_length):
continue
seq = seq1 + DNA.rc(seq2)
# If the total number of Ns is more than the max
# allowed ignore this sequence
if seq.count('N') > seq_max_N:
continue
qual = qual1 + qual2[::-1]
yield read1_desc, read1_barcode, seq, qual
示例8: process_fastq_single_end_read_file
# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def process_fastq_single_end_read_file(fastq_read_f,
fastq_barcode_f,
barcode_to_sample_id,
store_unassigned=False,
max_bad_run_length=0,
phred_quality_threshold=2,
min_per_read_length_fraction=0.75,
rev_comp=False,
rev_comp_barcode=False,
seq_max_N=0,
start_seq_id=0,
filter_bad_illumina_qual_digit=False,
log_f=None,
histogram_f=None,
barcode_correction_fn=None,
max_barcode_errors=1.5,
strict_header_match=True,
phred_to_ascii_f=None):
"""parses fastq single-end read file
"""
header_index = 0
sequence_index = 1
quality_index = 2
seq_id = start_seq_id
# grab the first lines and then seek back to the beginning of the file
try:
fastq_read_f_line1 = fastq_read_f.readline()
fastq_read_f_line2 = fastq_read_f.readline()
fastq_read_f.seek(0)
except AttributeError:
fastq_read_f_line1 = fastq_read_f[0]
fastq_read_f_line2 = fastq_read_f[1]
# determine the version of casava that was used to generate the fastq
# to determine how to compare header lines and decode ascii phred scores
post_casava_v180 = is_casava_v180_or_later(fastq_read_f_line1)
if post_casava_v180:
check_header_match_f = check_header_match_180_or_later
if phred_to_ascii_f == None:
phred_to_ascii_f = phred_to_ascii33
else:
check_header_match_f = check_header_match_pre180
if phred_to_ascii_f == None:
phred_to_ascii_f = phred_to_ascii64
# determine the last unacceptable quality character
if phred_quality_threshold != None:
last_bad_quality_char = phred_to_ascii_f(phred_quality_threshold)
else:
# disable quality filter
last_bad_quality_char = ''
# compute the barcode length, if they are all the same.
# this is useful for selecting a subset of the barcode read
# if it's too long (e.g., for technical reasons on the sequencer)
barcode_lengths = set([len(bc) for bc, sid in barcode_to_sample_id.items()])
if len(barcode_lengths) == 1:
barcode_length = barcode_lengths.pop()
else:
barcode_length = None
# compute the minimum read length as a fraction of the length of the input read
min_per_read_length = min_per_read_length_fraction * len(fastq_read_f_line2)
# prep data for logging
input_sequence_count = 0
count_barcode_not_in_map = 0
count_too_short = 0
count_too_many_N = 0
count_bad_illumina_qual_digit = 0
count_barcode_errors_exceed_max = 0
sequence_lengths = []
seqs_per_sample_counts = {}
for bc_data,read_data in izip(MinimalFastqParser(fastq_barcode_f,strict=False),
MinimalFastqParser(fastq_read_f,strict=False)):
input_sequence_count += 1
# Confirm match between barcode and read headers
if strict_header_match and \
(not check_header_match_f(bc_data[header_index],read_data[header_index])):
raise FastqParseError,\
("Headers of barcode and read do not match. Can't continue. "
"Confirm that the barcode fastq and read fastq that you are "
"passing match one another.")
else:
header = read_data[header_index]
# Grab the barcode sequence
if barcode_length:
# because thirteen cycles are sometimes used for
# techical reasons, this step looks only at the
# first tweleve bases. note that the barcode is
# rev-comp'ed after this step if requested since
# the thirteen base is a technical artefact, not
# barcode sequence.
barcode = bc_data[sequence_index][:barcode_length]
else:
barcode = bc_data[sequence_index]
if rev_comp_barcode:
barcode = DNA.rc(barcode)
#.........这里部分代码省略.........
示例9: main
# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def main():
option_parser, opts, args =\
parse_command_line_parameters(**script_info)
sequence_read_fps = opts.sequence_read_fps
barcode_read_fps = opts.barcode_read_fps
sample_id = opts.sample_id
mapping_fps = opts.mapping_fps
phred_quality_threshold = opts.phred_quality_threshold
retain_unassigned_reads = opts.retain_unassigned_reads
min_per_read_length_fraction = opts.min_per_read_length_fraction
max_bad_run_length = opts.max_bad_run_length
rev_comp = opts.rev_comp
rev_comp_barcode = opts.rev_comp_barcode
rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
seq_max_N = opts.sequence_max_n
start_seq_id = opts.start_seq_id
# NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
filter_bad_illumina_qual_digit = False #opts.filter_bad_illumina_qual_digit
store_qual_scores = opts.store_qual_scores
store_demultiplexed_fastq = opts.store_demultiplexed_fastq
barcode_type = opts.barcode_type
max_barcode_errors = opts.max_barcode_errors
# if this is not a demultiplexed run,
if barcode_type == 'not-barcoded':
if sample_id == None:
option_parser.error("If not providing barcode reads (because "
"your data is not multiplexed), must provide a --sample_id.")
barcode_read_fps = [None] * len(sequence_read_fps)
elif barcode_read_fps == None:
option_parser.error("Must provide --barcode_fps if "
"--barcode_type is not 'not-barcoded'")
else:
pass
phred_offset = opts.phred_offset
if phred_offset != None:
try:
phred_to_ascii_f = phred_to_ascii_fs[phred_offset]
except KeyError:
# shouldn't be able to get here, but we'll stay on the
# safe side
opption_parser.error(\
"Only valid phred offsets are: %s" %\
' '.join(phred_to_ascii_fs.keys()))
else:
# let split_libraries_fastq.process_fastq_single_end_read_file
# figure it out...
phred_to_ascii_f = None
if opts.last_bad_quality_char != None:
option_parser.error('--last_bad_quality_char is no longer supported. '
'Use -q instead (see option help text by passing -h)')
if not (0 <= min_per_read_length_fraction <= 1):
option_parser.error('--min_per_read_length_fraction must be between '
'0 and 1 (inclusive). You passed %1.5f' % min_per_read_length_fraction)
try:
barcode_correction_fn = BARCODE_DECODER_LOOKUP[barcode_type]
except KeyError:
barcode_correction_fn = None
if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
mapping_fps = mapping_fps * len(sequence_read_fps)
if len(set([len(sequence_read_fps), len(barcode_read_fps), len(mapping_fps)])) > 1:
option_parser.error("Same number of sequence, barcode and mapping files must be provided.")
output_dir = opts.output_dir
create_dir(output_dir)
output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
output_fp = '%s/seqs.fna' % output_dir
output_f = open(output_fp_temp,'w')
qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
qual_fp = '%s/seqs.qual' % output_dir
output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
output_fastq_fp = '%s/seqs.fastq' % output_dir
if store_qual_scores:
qual_f = open(qual_fp_temp,'w')
# define a qual writer whether we're storing
# qual strings or not so we don't have to check
# every time through the for loop below
def qual_writer(h,q):
qual_f.write('>%s\n%s\n' % (h,q))
else:
def qual_writer(h,q):
pass
if store_demultiplexed_fastq:
output_fastq_f = open(output_fastq_fp_temp,'w')
# define a fastq writer whether we're storing
# qual strings or not so we don't have to check
# every time through the for loop below
def fastq_writer(h,s,q):
output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h,s,q))
else:
#.........这里部分代码省略.........
示例10: get_primers
# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def get_primers(header, mapping_data):
""" Returns lists of forward/reverse primer regular expression generators
header: list of strings of header data.
mapping_data: list of lists of mapping data
Will raise error if either the LinkerPrimerSequence or ReversePrimer fields
are not present
"""
if "LinkerPrimerSequence" in header:
primer_ix = header.index("LinkerPrimerSequence")
else:
raise IndexError, ("Mapping file is missing LinkerPrimerSequence field.")
if "ReversePrimer" in header:
rev_primer_ix = header.index("ReversePrimer")
else:
raise IndexError, ("Mapping file is missing ReversePrimer field.")
iupac = {
"A": "A",
"T": "T",
"G": "G",
"C": "C",
"R": "[AG]",
"Y": "[CT]",
"S": "[GC]",
"W": "[AT]",
"K": "[GT]",
"M": "[AC]",
"B": "[CGT]",
"D": "[AGT]",
"H": "[ACT]",
"V": "[ACG]",
"N": "[ACGT]",
}
raw_forward_primers = set([])
raw_forward_rc_primers = set([])
raw_reverse_primers = set([])
raw_reverse_rc_primers = set([])
for line in mapping_data:
# Split on commas to handle pool of primers
raw_forward_primers.update([upper(primer).strip() for primer in line[primer_ix].split(",")])
raw_forward_rc_primers.update([DNA.rc(primer) for primer in raw_forward_primers])
raw_reverse_primers.update([upper(primer).strip() for primer in line[rev_primer_ix].split(",")])
raw_reverse_rc_primers.update([DNA.rc(primer) for primer in raw_reverse_primers])
if not raw_forward_primers:
raise ValueError, ("No forward primers detected in mapping file.")
if not raw_reverse_primers:
raise ValueError, ("No reverse primers detected in mapping file.")
# Finding the forward primers, or rc of reverse primers indicates forward
# read. Finding the reverse primer, or rc of the forward primers, indicates
# the reverse read, so these sets are merged.
raw_forward_primers.update(raw_reverse_rc_primers)
raw_reverse_primers.update(raw_forward_rc_primers)
forward_primers = []
reverse_primers = []
for curr_primer in raw_forward_primers:
forward_primers.append(compile("".join([iupac[symbol] for symbol in curr_primer])))
for curr_primer in raw_reverse_primers:
reverse_primers.append(compile("".join([iupac[symbol] for symbol in curr_primer])))
return forward_primers, reverse_primers
示例11: process_barcode_in_label
# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def process_barcode_in_label(
read1_data,
read2_data,
output_bc_fastq,
bc1_len=6,
bc2_len=6,
rev_comp_bc1=False,
rev_comp_bc2=False,
char_delineator=":",
):
""" Reads data from one or two fastq labels, writes output barcodes file.
read1_data: list of header, read, quality scores
read2_data: list of header, read, quality scores, False if no read 2.
output_bc_fastq: open output fastq filepath
bc1_len: length of barcode to remove from beginning of read1 data
bc2_len: length of barcode to remove from beginning of read2 data
rev_comp_bc1: reverse complement barcode 1 before writing.
rev_comp_bc2: reverse complement barcode 2 before writing.
char_delineator: Specify character that immediately precedes the barcode
for input_type of barcode_in_label.
"""
header_index = 0
sequence_index = 1
quality_index = 2
# Check for char_delineator in sequence
try:
bc1_read = read1_data[header_index].split(char_delineator)[-1][0:bc1_len]
# If there is an index error, it means the char_delineator wasn't found
except IndexError:
raise IndexError, (
"Found sequence lacking character delineator. "
"Sequence header %s, character delineator %s" % (read1_data[header_index], char_delineator)
)
# Create fake quality scores
bc1_qual = "F" * len(bc1_read)
if rev_comp_bc1:
bc1_read = DNA.rc(bc1_read)
if read2_data:
bc2_read = read2_data[header_index].strip().split(char_delineator)[-1][0:bc2_len]
bc2_qual = "F" * len(bc2_read)
if rev_comp_bc2:
bc2_read = DNA.rc(bc2_read)
else:
bc2_read = ""
bc2_qual = ""
if not bc1_read and not bc2_read:
raise ValueError, (
"Came up with empty barcode sequence, please check "
"character delineator with -s, and fastq label "
"%s" % read1_data[header_index]
)
bc_lines = format_fastq_record(read1_data[header_index], bc1_read + bc2_read, bc1_qual + bc2_qual)
output_bc_fastq.write(bc_lines)
return
示例12: process_barcode_paired_stitched
# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def process_barcode_paired_stitched(
read_data,
output_bc_fastq,
output_fastq,
bc1_len=6,
bc2_len=6,
rev_comp_bc1=False,
rev_comp_bc2=False,
attempt_read_orientation=False,
forward_primers=None,
reverse_primers=None,
output_bc_not_oriented=None,
fastq_out_not_oriented=None,
switch_bc_order=False,
):
""" Processes stitched barcoded reads, writes barcode, parsed stitched read
read_data: list of header, read, quality scores
output_bc_fastq: open output fastq filepath
output_fastq: open output fastq reads filepath
bc1_len: length of barcode to remove from beginning of read1 stitched data
bc2_len: length of barcode to remove from end of read2 stitched data
rev_comp_bc1: reverse complement barcode 1 before writing.
rev_comp_bc2: reverse complement barcode 2 before writing.
attempt_read_orientation: If True, will attempt to orient the reads
according to the forward primers in the mapping file. If primer is
detected in current orientation, leave the read as is, but if reverse
complement is detected (or ReversePrimer is detected in the current
orientation) the read will either be written to the forward (read 1) or
reverse (read 2) reads for the case of paired files, or the read will be
reverse complemented in the case of stitched reads.
forward_primers: list of regular expression generators, forward primers
reverse_primers: list of regular expression generators, reverse primers
output_bc_not_oriented: Barcode output from reads that are not oriented
fastq_out_not_oriented: Open filepath to write reads where primers
can't be found when attempt_read_orientation is True.
switch_bc_order: Normally, barcode 1 will be written first, followed by
barcode 2 in a combined output fastq file. If True, the order will be
reversed. Only applies to stitched reads processing, as other barcode
orders are dictated by the the parameter chosen for the fastq files.
"""
header_index = 0
sequence_index = 1
quality_index = 2
read_seq = read_data[sequence_index]
read_qual = read_data[quality_index]
found_primer_match = False
# Break from orientation search as soon as a match is found
if attempt_read_orientation:
for curr_primer in forward_primers:
if curr_primer.search(read_data[sequence_index]):
found_primer_match = True
break
if not found_primer_match:
for curr_primer in reverse_primers:
if curr_primer.search(read_data[sequence_index]):
read_seq = DNA.rc(read_seq)
read_qual = read_qual[::-1]
found_primer_match = True
break
if not found_primer_match and attempt_read_orientation:
output_bc = output_bc_not_oriented
output_read = fastq_out_not_oriented
else:
output_bc = output_bc_fastq
output_read = output_fastq
bc_read1 = read_seq[0:bc1_len]
bc_read2 = read_seq[-bc2_len:]
bc_qual1 = read_qual[0:bc1_len]
bc_qual2 = read_qual[-bc2_len:]
if rev_comp_bc1:
bc_read1 = DNA.rc(bc_read1)
bc_qual1 = bc_qual1[::-1]
if rev_comp_bc2:
bc_read2 = DNA.rc(bc_read2)
bc_qual2 = bc_qual2[::-1]
if switch_bc_order:
bc_read1, bc_read2 = bc_read2, bc_read1
bc_qual1, bc_qual2 = bc_qual2, bc_qual1
bc_lines = format_fastq_record(read_data[header_index], bc_read1 + bc_read2, bc_qual1 + bc_qual2)
output_bc.write(bc_lines)
seq_lines = format_fastq_record(read_data[header_index], read_seq[bc1_len:-bc2_len], read_qual[bc1_len:-bc2_len])
output_read.write(seq_lines)
return
示例13: process_barcode_paired_end_data
# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def process_barcode_paired_end_data(
read1_data,
read2_data,
output_bc_fastq,
output_fastq1,
output_fastq2,
bc1_len=6,
bc2_len=6,
rev_comp_bc1=False,
rev_comp_bc2=False,
attempt_read_orientation=False,
forward_primers=None,
reverse_primers=None,
output_bc_not_oriented=None,
fastq1_out_not_oriented=None,
fastq2_out_not_oriented=None,
):
""" Processes, writes paired-end barcode data, parsed sequences
read1_data: list of header, read, quality scores
read2_data: list of header, read, quality scores
output_bc_fastq: open output fastq filepath
output_fastq1: open output fastq reads 1 filepath
output_fastq2: open output fastq reads 2 filepath
bc1_len: length of barcode to remove from beginning of read1 data
bc2_len: length of barcode to remove from beginning of read2 data
rev_comp_bc1: reverse complement barcode 1 before writing.
rev_comp_bc2: reverse complement barcode 2 before writing.
attempt_read_orientation: If True, will attempt to orient the reads
according to the forward primers in the mapping file. If primer is
detected in current orientation, leave the read as is, but if reverse
complement is detected (or ReversePrimer is detected in the current
orientation) the read will either be written to the forward (read 1) or
reverse (read 2) reads for the case of paired files, or the read will be
reverse complemented in the case of stitched reads.
forward_primers: list of regular expression generators, forward primers
reverse_primers: list of regular expression generators, reverse primers
output_bc_not_oriented: Barcode output from reads that are not oriented
fastq1_out_not_oriented: Open filepath to write reads 1 where primers
can't be found when attempt_read_orientation is True.
fastq2_out_not_oriented: Open filepath to write reads 2 where primers
can't be found when attempt_read_orientation is True.
"""
header_index = 0
sequence_index = 1
quality_index = 2
found_primer_match = False
# Break from orientation search as soon as a match is found
if attempt_read_orientation:
# First check forward primers
for curr_primer in forward_primers:
if curr_primer.search(read1_data[sequence_index]):
read1 = read1_data
read2 = read2_data
found_primer_match = True
break
if curr_primer.search(read2_data[sequence_index]):
read1 = read2_data
read2 = read1_data
found_primer_match = True
break
# Check reverse primers if forward primers not found
if not found_primer_match:
for curr_primer in reverse_primers:
if curr_primer.search(read1_data[sequence_index]):
read1 = read2_data
read2 = read1_data
found_primer_match = True
break
if curr_primer.search(read2_data[sequence_index]):
read1 = read1_data
read2 = read2_data
found_primer_match = True
break
else:
read1 = read1_data
read2 = read2_data
if not found_primer_match and attempt_read_orientation:
read1 = read1_data
read2 = read2_data
output_bc = output_bc_not_oriented
output_read1 = fastq1_out_not_oriented
output_read2 = fastq2_out_not_oriented
else:
output_bc = output_bc_fastq
output_read1 = output_fastq1
output_read2 = output_fastq2
bc_read1 = read1[sequence_index][0:bc1_len]
bc_read2 = read2[sequence_index][0:bc2_len]
bc_qual1 = read1[quality_index][0:bc1_len]
bc_qual2 = read2[quality_index][0:bc2_len]
if rev_comp_bc1:
bc_read1 = DNA.rc(bc_read1)
bc_qual1 = bc_qual1[::-1]
if rev_comp_bc2:
bc_read2 = DNA.rc(bc_read2)
#.........这里部分代码省略.........
示例14: get_primers
# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def get_primers(header,
mapping_data):
""" Returns lists of forward/reverse primer regular expression generators
header: list of strings of header data.
mapping_data: list of lists of mapping data
Will raise error if either the LinkerPrimerSequence or ReversePrimer fields
are not present
"""
if "LinkerPrimerSequence" in header:
primer_ix = header.index("LinkerPrimerSequence")
else:
raise IndexError(
("Mapping file is missing LinkerPrimerSequence field."))
if "ReversePrimer" in header:
rev_primer_ix = header.index("ReversePrimer")
else:
raise IndexError(("Mapping file is missing ReversePrimer field."))
iupac = {'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C', 'R': '[AG]', 'Y': '[CT]',
'S': '[GC]', 'W': '[AT]', 'K': '[GT]', 'M': '[AC]', 'B': '[CGT]',
'D': '[AGT]', 'H': '[ACT]', 'V': '[ACG]', 'N': '[ACGT]'}
raw_forward_primers = set([])
raw_forward_rc_primers = set([])
raw_reverse_primers = set([])
raw_reverse_rc_primers = set([])
for line in mapping_data:
# Split on commas to handle pool of primers
raw_forward_primers.update([upper(primer).strip() for
primer in line[primer_ix].split(',')])
raw_forward_rc_primers.update([DNA.rc(primer) for
primer in raw_forward_primers])
raw_reverse_primers.update([upper(primer).strip() for
primer in line[rev_primer_ix].split(',')])
raw_reverse_rc_primers.update([DNA.rc(primer) for
primer in raw_reverse_primers])
if not raw_forward_primers:
raise ValueError(("No forward primers detected in mapping file."))
if not raw_reverse_primers:
raise ValueError(("No reverse primers detected in mapping file."))
# Finding the forward primers, or rc of reverse primers indicates forward
# read. Finding the reverse primer, or rc of the forward primers, indicates
# the reverse read, so these sets are merged.
raw_forward_primers.update(raw_reverse_rc_primers)
raw_reverse_primers.update(raw_forward_rc_primers)
forward_primers = []
reverse_primers = []
for curr_primer in raw_forward_primers:
forward_primers.append(compile(''.join([iupac[symbol] for
symbol in curr_primer])))
for curr_primer in raw_reverse_primers:
reverse_primers.append(compile(''.join([iupac[symbol] for
symbol in curr_primer])))
return forward_primers, reverse_primers
示例15: process_fastq_single_end_read_file
# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def process_fastq_single_end_read_file(fastq_read_f,
fastq_barcode_f,
barcode_to_sample_id,
store_unassigned=False,
max_bad_run_length=0,
last_bad_quality_char='B',
min_per_read_length=75,
rev_comp=False,
rev_comp_barcode=False,
seq_max_N=0,
start_seq_id=0,
filter_bad_illumina_qual_digit=True,
log_f=None,
histogram_f=None,
barcode_correction_fn=None,
max_barcode_errors=1.5):
"""parses fastq single-end read file
"""
header_index = 0
sequence_index = 1
quality_index = 2
seq_id = start_seq_id
# prep data for logging
input_sequence_count = 0
count_barcode_not_in_map = 0
count_too_short = 0
count_too_many_N = 0
count_bad_illumina_qual_digit = 0
count_barcode_errors_exceed_max = 0
sequence_lengths = []
seqs_per_sample_counts = {}
for bc_data,read_data in izip(MinimalFastqParser(fastq_barcode_f,strict=False),
MinimalFastqParser(fastq_read_f,strict=False)):
input_sequence_count += 1
# Confirm match between barcode and read headers
if not check_header_match(bc_data[header_index],
read_data[header_index]):
raise FastqParseError,\
("Headers of barcode and read do not match. Can't continue. "
"Confirm that the barcode fastq and read fastq that you are "
"passing match one another.")
else:
header = read_data[header_index]
# Grab the barcode sequence
barcode = bc_data[sequence_index]
if rev_comp_barcode:
barcode = DNA.rc(barcode)
# Grab the read sequence
sequence = read_data[1]
# Grab the read quality
quality = read_data[2]
# correct the barcode (if applicable) and map to sample id
num_barcode_errors, corrected_barcode, correction_attempted, sample_id = \
correct_barcode(barcode,barcode_to_sample_id,barcode_correction_fn)
# skip samples with too many errors
if (num_barcode_errors > max_barcode_errors):
count_barcode_errors_exceed_max += 1
continue
# skip unassignable samples unless otherwise requested
if sample_id == None:
if not store_unassigned:
count_barcode_not_in_map += 1
continue
else:
sample_id = 'Unassigned'
quality_filter_result, sequence, quality =\
quality_filter_sequence(header,
sequence,
quality,
max_bad_run_length,
last_bad_quality_char,
min_per_read_length,
seq_max_N,
filter_bad_illumina_qual_digit)
# process quality result
if quality_filter_result != 0:
# if the quality filter didn't pass record why and
# move on to the next record
if quality_filter_result == 1:
count_too_short += 1
elif quality_filter_result == 2:
count_too_many_N += 1
elif quality_filter_result == 3:
count_bad_illumina_qual_digit += 1
else:
raise ValueError,\
"Unknown quality filter result: %d" % quality_filter_result
continue
sequence_lengths.append(len(sequence))
try:
#.........这里部分代码省略.........