当前位置: 首页>>代码示例>>Python>>正文


Python DNA.rc方法代码示例

本文整理汇总了Python中cogent.DNA.rc方法的典型用法代码示例。如果您正苦于以下问题:Python DNA.rc方法的具体用法?Python DNA.rc怎么用?Python DNA.rc使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在cogent.DNA的用法示例。


在下文中一共展示了DNA.rc方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: process_uclust_pw_alignment_results

# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def process_uclust_pw_alignment_results(fasta_pairs_lines,uc_lines):
    """ Process results of uclust search and align """
    alignments = get_next_two_fasta_records(fasta_pairs_lines)
    for hit in get_next_record_type(uc_lines,'H'):
        matching_strand = hit[4]
        if matching_strand == '-':
            strand_id = '-'
            target_rev_match = True
        elif matching_strand == '+':
            strand_id = '+'
            target_rev_match = False
        elif matching_strand == '.':
            # protein sequence, so no strand information
            strand_id = ''
            target_rev_match = False
        else:
            raise UclustParseError, "Unknown strand type: %s" % matching_strand
        uc_query_id = hit[8]
        uc_target_id = hit[9]
        percent_id = float(hit[3])
        
        fasta_pair = alignments.next()
        
        fasta_query_id = fasta_pair[0][0]
        aligned_query = fasta_pair[0][1]
        
        if fasta_query_id != uc_query_id:
            raise UclustParseError,\
             "Order of fasta and uc files do not match."+\
             " Got query %s but expected %s." %\
              (fasta_query_id, uc_query_id)
            
        fasta_target_id = fasta_pair[1][0]
        aligned_target = fasta_pair[1][1]
            
        if fasta_target_id != uc_target_id + strand_id:
            raise UclustParseError, \
             "Order of fasta and uc files do not match."+\
             " Got target %s but expected %s." %\
              (fasta_target_id, uc_target_id + strand_id)
            
        if target_rev_match:
            query_id = uc_query_id + ' RC'
            aligned_query = DNA.rc(aligned_query)
            target_id = uc_target_id
            aligned_target = DNA.rc(aligned_target)
        else:
            query_id = uc_query_id
            aligned_query = aligned_query
            target_id = uc_target_id
            aligned_target = aligned_target
            
        yield (query_id, target_id, aligned_query, aligned_target, percent_id)
开发者ID:Jorge-C,项目名称:qiime,代码行数:55,代码来源:uclust.py

示例2: parse_illumina_line

# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def parse_illumina_line(l, barcode_length, rev_comp_barcode,
                        barcode_in_sequence=False):
    """Parses a single line of Illumina data
    """
    fields = l.strip().split(':')

    y_position_subfields = fields[4].split('#')
    y_position = int(y_position_subfields[0])
    sequence = fields[5]
    qual_string = fields[6]

    if barcode_in_sequence:
        barcode = sequence[:barcode_length]
        sequence = sequence[barcode_length:]
        qual_string = qual_string[barcode_length:]
    else:
        barcode = y_position_subfields[1][:barcode_length]

    if rev_comp_barcode:
        barcode = DNA.rc(barcode)

    result = {
        'Full description': ':'.join(fields[:5]),
        'Machine Name': fields[0],
        'Channel Number': int(fields[1]),
        'Tile Number': int(fields[2]),
        'X Position': int(fields[3]),
        'Y Position': y_position,
        'Barcode': barcode,
        'Full Y Position Field': fields[4],
        'Sequence': sequence,
        'Quality Score': qual_string}

    return result
开发者ID:Gaby1212,项目名称:qiime,代码行数:36,代码来源:parse.py

示例3: process_barcode_single_end_data

# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def process_barcode_single_end_data(read1_data, output_bc_fastq, output_fastq1, bc1_len=6, rev_comp_bc1=False):
    """ Processes, writes single-end barcode data, parsed sequence
    
    read1_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq1: open output fastq reads filepath
    bc1_len: length of barcode to remove from beginning of data
    rev_comp_bc1: reverse complement barcode before writing.
    """

    header_index = 0
    sequence_index = 1
    quality_index = 2

    bc_read = read1_data[sequence_index][:bc1_len]
    bc_qual = read1_data[quality_index][:bc1_len]
    if rev_comp_bc1:
        bc_read = DNA.rc(bc_read)
        bc_qual = bc_qual[::-1]

    bc_lines = format_fastq_record(read1_data[header_index], bc_read, bc_qual)
    output_bc_fastq.write(bc_lines)
    seq_lines = format_fastq_record(
        read1_data[header_index], read1_data[sequence_index][bc1_len:], read1_data[quality_index][bc1_len:]
    )
    output_fastq1.write(seq_lines)

    return
开发者ID:rob-knight,项目名称:qiime,代码行数:30,代码来源:extract_barcodes.py

示例4: parse_illumina_single_end_read_file

# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def parse_illumina_single_end_read_file(read_file,barcode_length,\
    max_bad_run_length,quality_threshold,min_per_read_length,
    rev_comp,rev_comp_barcode,barcode_in_seq,barcode_max_N=0,seq_max_N=0):
    """Parses Illumina single-end read file
    """
    
    for read_line in read_file:
        read = parse_illumina_line(read_line,barcode_length,
                                   rev_comp_barcode,barcode_in_seq)
        
        read_desc = illumina_read_description_from_read_data(read)
        
        read_barcode = read['Barcode']
        
        if read_barcode.count('N') > barcode_max_N:
           continue
        
        seq, qual = read_qual_score_filter(\
         read['Sequence'], read['Quality Score'],\
         max_bad_run_length, quality_threshold)
         
        if (len(seq) < min_per_read_length) or (seq.count('N') > seq_max_N):
            continue
            
        if rev_comp:
            seq = DNA.rc(seq)
            qual = qual[::-1]
        
        yield read_desc, read_barcode, seq, qual
开发者ID:Ecogenomics,项目名称:FrankenQIIME,代码行数:31,代码来源:split_libraries_illumina.py

示例5: rc_fasta_lines

# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def rc_fasta_lines(fasta_lines, seq_desc_mapper=append_rc):
    """
    """
    for seq_id, seq in parse_fasta(fasta_lines):
        seq_id = seq_desc_mapper(seq_id)
        seq = DNA.rc(seq.upper())
        yield seq_id, seq
    return
开发者ID:Gaby1212,项目名称:qiime,代码行数:10,代码来源:adjust_seq_orientation.py

示例6: get_rev_primer_seqs

# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def get_rev_primer_seqs(mapping_fp):
    """ Parses mapping file to get dictionary of SampleID:Rev primer
    mapping_fp:  mapping filepath
    """
    hds, mapping_data, run_description, errors, warnings = \
        process_id_map(mapping_fp, has_barcodes=False,
         disable_primer_check=True)
        
    if errors:
        for curr_err in errors:
            if curr_err.startswith("Duplicate SampleID"):
                raise ValueError,('Errors were found with mapping file, '+\
                 'please run check_id_map.py to identify problems.')
         
    # create dict of dicts with SampleID:{each header:mapping data}
    
    id_map = {}
    
    for curr_data in mapping_data:
        id_map[curr_data[0]] = {}
        
    
    for header in range(len(hds)):
        for curr_data in mapping_data:
            id_map[curr_data[0]][hds[header]] = curr_data[header]
    
    reverse_primers = {}
    
    for curr_id in id_map.keys():
        try:
            reverse_primers[curr_id] =\
             [DNA.rc(curr_rev_primer) for curr_rev_primer in\
             id_map[curr_id]['ReversePrimer'].split(',')]
        except KeyError:
            raise KeyError,("Reverse primer not found in mapping file, "+\
             "please include a 'ReversePrimer' column.")

             
    # Check for valid reverse primers
    # Will have been detected as warnings from mapping file
    for curr_err in errors:
        if curr_err.startswith("Invalid DNA sequence detected"):
            raise ValueError,("Problems found with reverse primers, please "+\
             "check mapping file with check_id_map.py")
    
    return reverse_primers
开发者ID:rob-knight,项目名称:qiime,代码行数:48,代码来源:truncate_reverse_primer.py

示例7: parse_illumina_paired_end_read_files

# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def parse_illumina_paired_end_read_files(read1_file,read2_file,barcode_length,\
    max_bad_run_length,quality_threshold,min_per_read_length,rev_comp_barcode,\
    barcode_max_N=0,seq_max_N=0):
    """Parses Illumina paired-end read file pair
    """
    
    for read1_line, read2_line in izip(read1_file,read2_file):
        read1 = parse_illumina_line(read1_line,barcode_length,rev_comp_barcode)
        read2 = parse_illumina_line(read2_line,barcode_length,rev_comp_barcode)
        
        read1_desc = illumina_read_description_from_read_data(read1)
        read2_desc = illumina_read_description_from_read_data(read2)
        
        read1_barcode = read1['Barcode']
        read2_barcode = read2['Barcode']
        if (read1_barcode.count('N') > barcode_max_N) or \
           (read2_barcode.count('N') > barcode_max_N):
           continue
        
        if read1_desc != read2_desc:
            raise IlluminaParseError, \
              "Error in sequence files, descriptions of"+\
              " corresponding lines are not compatible: %s != %s" %\
              (read1_desc, read2_desc)
        assert read1_barcode == read2_barcode
        
        seq1, qual1 = read_qual_score_filter(\
         read1['Sequence'], read1['Quality Score'],\
         max_bad_run_length, quality_threshold)
        if (len(seq1) < min_per_read_length):
            continue
            
        seq2, qual2 = read_qual_score_filter(\
         read2['Sequence'], read2['Quality Score'],\
         max_bad_run_length, quality_threshold)
        if (len(seq2) < min_per_read_length):
            continue
            
        seq = seq1 + DNA.rc(seq2)
        # If the total number of Ns is more than the max 
        # allowed ignore this sequence
        if seq.count('N') > seq_max_N:
            continue
        qual = qual1 + qual2[::-1]
        
        yield read1_desc, read1_barcode, seq, qual
开发者ID:Ecogenomics,项目名称:FrankenQIIME,代码行数:48,代码来源:split_libraries_illumina.py

示例8: process_fastq_single_end_read_file

# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def process_fastq_single_end_read_file(fastq_read_f,
                                       fastq_barcode_f,
                                       barcode_to_sample_id,
                                       store_unassigned=False,
                                       max_bad_run_length=0,
                                       phred_quality_threshold=2,
                                       min_per_read_length_fraction=0.75,
                                       rev_comp=False,
                                       rev_comp_barcode=False,
                                       seq_max_N=0,
                                       start_seq_id=0,
                                       filter_bad_illumina_qual_digit=False,
                                       log_f=None,
                                       histogram_f=None,
                                       barcode_correction_fn=None,
                                       max_barcode_errors=1.5,
                                       strict_header_match=True,
                                       phred_to_ascii_f=None):
    """parses fastq single-end read file
    """
    header_index = 0
    sequence_index = 1
    quality_index = 2
    
    seq_id = start_seq_id
    # grab the first lines and then seek back to the beginning of the file
    try:
        fastq_read_f_line1 = fastq_read_f.readline()
        fastq_read_f_line2 = fastq_read_f.readline()
        fastq_read_f.seek(0)
    except AttributeError:
        fastq_read_f_line1 = fastq_read_f[0]
        fastq_read_f_line2 = fastq_read_f[1]
    
    # determine the version of casava that was used to generate the fastq
    # to determine how to compare header lines and decode ascii phred scores
    post_casava_v180 = is_casava_v180_or_later(fastq_read_f_line1)
    if post_casava_v180:
        check_header_match_f = check_header_match_180_or_later
        if phred_to_ascii_f == None:
            phred_to_ascii_f = phred_to_ascii33
    else:
        check_header_match_f = check_header_match_pre180
        if phred_to_ascii_f == None:
            phred_to_ascii_f = phred_to_ascii64
    
    # determine the last unacceptable quality character
    if phred_quality_threshold != None:
        last_bad_quality_char = phred_to_ascii_f(phred_quality_threshold)
    else:
        # disable quality filter
        last_bad_quality_char = ''
    
    # compute the barcode length, if they are all the same. 
    # this is useful for selecting a subset of the barcode read
    # if it's too long (e.g., for technical reasons on the sequencer)
    barcode_lengths = set([len(bc) for bc, sid in barcode_to_sample_id.items()])
    if len(barcode_lengths) == 1:
        barcode_length = barcode_lengths.pop()
    else:
        barcode_length = None
    
    # compute the minimum read length as a fraction of the length of the input read
    min_per_read_length = min_per_read_length_fraction * len(fastq_read_f_line2)
    
    # prep data for logging
    input_sequence_count = 0
    count_barcode_not_in_map = 0
    count_too_short = 0
    count_too_many_N = 0
    count_bad_illumina_qual_digit = 0
    count_barcode_errors_exceed_max = 0
    sequence_lengths = []
    seqs_per_sample_counts = {}
    for bc_data,read_data in izip(MinimalFastqParser(fastq_barcode_f,strict=False),
                                  MinimalFastqParser(fastq_read_f,strict=False)):
        input_sequence_count += 1
        # Confirm match between barcode and read headers
        if strict_header_match and \
           (not check_header_match_f(bc_data[header_index],read_data[header_index])):
            raise FastqParseError,\
             ("Headers of barcode and read do not match. Can't continue. "
              "Confirm that the barcode fastq and read fastq that you are "
              "passing match one another.")
        else:
            header = read_data[header_index]
        
        # Grab the barcode sequence
        if barcode_length:
            # because thirteen cycles are sometimes used for 
            # techical reasons, this step looks only at the 
            # first tweleve bases. note that the barcode is
            # rev-comp'ed after this step if requested since
            # the thirteen base is a technical artefact, not
            # barcode sequence.
            barcode = bc_data[sequence_index][:barcode_length]
        else:
            barcode = bc_data[sequence_index]
        if rev_comp_barcode:
            barcode = DNA.rc(barcode)
#.........这里部分代码省略.........
开发者ID:Jorge-C,项目名称:qiime,代码行数:103,代码来源:split_libraries_fastq.py

示例9: main

# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_id = opts.sample_id
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    filter_bad_illumina_qual_digit = False #opts.filter_bad_illumina_qual_digit
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors
    
    # if this is not a demultiplexed run, 
    if barcode_type == 'not-barcoded':
        if sample_id == None:
            option_parser.error("If not providing barcode reads (because "
            "your data is not multiplexed), must provide a --sample_id.")
        barcode_read_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps == None:
        option_parser.error("Must provide --barcode_fps if "
                            "--barcode_type is not 'not-barcoded'")
    else:
        pass
    
    phred_offset = opts.phred_offset
    if phred_offset != None:
        try:
            phred_to_ascii_f = phred_to_ascii_fs[phred_offset]
        except KeyError:
            # shouldn't be able to get here, but we'll stay on the
            # safe side
            opption_parser.error(\
             "Only valid phred offsets are: %s" %\
             ' '.join(phred_to_ascii_fs.keys()))
    else:
        # let split_libraries_fastq.process_fastq_single_end_read_file 
        # figure it out...
        phred_to_ascii_f = None
    
    if opts.last_bad_quality_char != None:
        option_parser.error('--last_bad_quality_char is no longer supported. '
         'Use -q instead (see option help text by passing -h)')
    
    if not (0 <= min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be between '
         '0 and 1 (inclusive). You passed %1.5f' % min_per_read_length_fraction)
    
    try:
        barcode_correction_fn = BARCODE_DECODER_LOOKUP[barcode_type]
    except KeyError:
        barcode_correction_fn = None
    
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)
    
    if len(set([len(sequence_read_fps), len(barcode_read_fps), len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode and mapping files must be provided.")
    
    output_dir = opts.output_dir
    create_dir(output_dir)
    
    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp,'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir
    
    if store_qual_scores:
        qual_f = open(qual_fp_temp,'w')
        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below
        def qual_writer(h,q):
            qual_f.write('>%s\n%s\n' % (h,q))
    else:
        def qual_writer(h,q):
            pass
    
    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp,'w')
        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below
        def fastq_writer(h,s,q):
            output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h,s,q))
    else:
#.........这里部分代码省略.........
开发者ID:DDomogala3,项目名称:qiime,代码行数:103,代码来源:split_libraries_fastq.py

示例10: get_primers

# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def get_primers(header, mapping_data):
    """ Returns lists of forward/reverse primer regular expression generators
    
    header:  list of strings of header data.
    mapping_data:  list of lists of mapping data
    
    Will raise error if either the LinkerPrimerSequence or ReversePrimer fields
        are not present
    """

    if "LinkerPrimerSequence" in header:
        primer_ix = header.index("LinkerPrimerSequence")
    else:
        raise IndexError, ("Mapping file is missing LinkerPrimerSequence field.")
    if "ReversePrimer" in header:
        rev_primer_ix = header.index("ReversePrimer")
    else:
        raise IndexError, ("Mapping file is missing ReversePrimer field.")

    iupac = {
        "A": "A",
        "T": "T",
        "G": "G",
        "C": "C",
        "R": "[AG]",
        "Y": "[CT]",
        "S": "[GC]",
        "W": "[AT]",
        "K": "[GT]",
        "M": "[AC]",
        "B": "[CGT]",
        "D": "[AGT]",
        "H": "[ACT]",
        "V": "[ACG]",
        "N": "[ACGT]",
    }

    raw_forward_primers = set([])
    raw_forward_rc_primers = set([])
    raw_reverse_primers = set([])
    raw_reverse_rc_primers = set([])

    for line in mapping_data:
        # Split on commas to handle pool of primers
        raw_forward_primers.update([upper(primer).strip() for primer in line[primer_ix].split(",")])
        raw_forward_rc_primers.update([DNA.rc(primer) for primer in raw_forward_primers])
        raw_reverse_primers.update([upper(primer).strip() for primer in line[rev_primer_ix].split(",")])
        raw_reverse_rc_primers.update([DNA.rc(primer) for primer in raw_reverse_primers])

    if not raw_forward_primers:
        raise ValueError, ("No forward primers detected in mapping file.")
    if not raw_reverse_primers:
        raise ValueError, ("No reverse primers detected in mapping file.")

    # Finding the forward primers, or rc of reverse primers indicates forward
    # read. Finding the reverse primer, or rc of the forward primers, indicates
    # the reverse read, so these sets are merged.
    raw_forward_primers.update(raw_reverse_rc_primers)
    raw_reverse_primers.update(raw_forward_rc_primers)

    forward_primers = []
    reverse_primers = []
    for curr_primer in raw_forward_primers:
        forward_primers.append(compile("".join([iupac[symbol] for symbol in curr_primer])))
    for curr_primer in raw_reverse_primers:
        reverse_primers.append(compile("".join([iupac[symbol] for symbol in curr_primer])))

    return forward_primers, reverse_primers
开发者ID:rob-knight,项目名称:qiime,代码行数:70,代码来源:extract_barcodes.py

示例11: process_barcode_in_label

# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def process_barcode_in_label(
    read1_data,
    read2_data,
    output_bc_fastq,
    bc1_len=6,
    bc2_len=6,
    rev_comp_bc1=False,
    rev_comp_bc2=False,
    char_delineator=":",
):
    """ Reads data from one or two fastq labels, writes output barcodes file. 
    
    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores, False if no read 2.
    output_bc_fastq: open output fastq filepath
    bc1_len: length of barcode to remove from beginning of read1 data
    bc2_len: length of barcode to remove from beginning of read2 data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    char_delineator: Specify character that immediately precedes the barcode
        for input_type of barcode_in_label.
    """

    header_index = 0
    sequence_index = 1
    quality_index = 2

    # Check for char_delineator in sequence
    try:
        bc1_read = read1_data[header_index].split(char_delineator)[-1][0:bc1_len]
    # If there is an index error, it means the char_delineator wasn't found
    except IndexError:
        raise IndexError, (
            "Found sequence lacking character delineator. "
            "Sequence header %s, character delineator %s" % (read1_data[header_index], char_delineator)
        )

    # Create fake quality scores
    bc1_qual = "F" * len(bc1_read)
    if rev_comp_bc1:
        bc1_read = DNA.rc(bc1_read)

    if read2_data:
        bc2_read = read2_data[header_index].strip().split(char_delineator)[-1][0:bc2_len]
        bc2_qual = "F" * len(bc2_read)
        if rev_comp_bc2:
            bc2_read = DNA.rc(bc2_read)
    else:
        bc2_read = ""
        bc2_qual = ""

    if not bc1_read and not bc2_read:
        raise ValueError, (
            "Came up with empty barcode sequence, please check "
            "character delineator with -s, and fastq label "
            "%s" % read1_data[header_index]
        )

    bc_lines = format_fastq_record(read1_data[header_index], bc1_read + bc2_read, bc1_qual + bc2_qual)

    output_bc_fastq.write(bc_lines)

    return
开发者ID:rob-knight,项目名称:qiime,代码行数:65,代码来源:extract_barcodes.py

示例12: process_barcode_paired_stitched

# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def process_barcode_paired_stitched(
    read_data,
    output_bc_fastq,
    output_fastq,
    bc1_len=6,
    bc2_len=6,
    rev_comp_bc1=False,
    rev_comp_bc2=False,
    attempt_read_orientation=False,
    forward_primers=None,
    reverse_primers=None,
    output_bc_not_oriented=None,
    fastq_out_not_oriented=None,
    switch_bc_order=False,
):
    """ Processes stitched barcoded reads, writes barcode, parsed stitched read
    
    read_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq: open output fastq reads filepath
    bc1_len: length of barcode to remove from beginning of read1 stitched data
    bc2_len: length of barcode to remove from end of read2 stitched data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, will attempt to orient the reads 
        according to the forward primers in the mapping file. If primer is 
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current 
        orientation) the read will either be written to the forward (read 1) or
        reverse (read 2) reads for the case of paired files, or the read will be
        reverse complemented in the case of stitched reads.
    forward_primers: list of regular expression generators, forward primers
    reverse_primers: list of regular expression generators, reverse primers
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq_out_not_oriented: Open filepath to write reads where primers
        can't be found when attempt_read_orientation is True.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
        barcode 2 in a combined output fastq file. If True, the order will be 
        reversed. Only applies to stitched reads processing, as other barcode
        orders are dictated by the the parameter chosen for the fastq files.
    """

    header_index = 0
    sequence_index = 1
    quality_index = 2

    read_seq = read_data[sequence_index]
    read_qual = read_data[quality_index]

    found_primer_match = False
    # Break from orientation search as soon as a match is found
    if attempt_read_orientation:
        for curr_primer in forward_primers:
            if curr_primer.search(read_data[sequence_index]):
                found_primer_match = True
                break
        if not found_primer_match:
            for curr_primer in reverse_primers:
                if curr_primer.search(read_data[sequence_index]):
                    read_seq = DNA.rc(read_seq)
                    read_qual = read_qual[::-1]
                    found_primer_match = True
                    break

    if not found_primer_match and attempt_read_orientation:
        output_bc = output_bc_not_oriented
        output_read = fastq_out_not_oriented
    else:
        output_bc = output_bc_fastq
        output_read = output_fastq

    bc_read1 = read_seq[0:bc1_len]
    bc_read2 = read_seq[-bc2_len:]
    bc_qual1 = read_qual[0:bc1_len]
    bc_qual2 = read_qual[-bc2_len:]

    if rev_comp_bc1:
        bc_read1 = DNA.rc(bc_read1)
        bc_qual1 = bc_qual1[::-1]
    if rev_comp_bc2:
        bc_read2 = DNA.rc(bc_read2)
        bc_qual2 = bc_qual2[::-1]

    if switch_bc_order:
        bc_read1, bc_read2 = bc_read2, bc_read1
        bc_qual1, bc_qual2 = bc_qual2, bc_qual1

    bc_lines = format_fastq_record(read_data[header_index], bc_read1 + bc_read2, bc_qual1 + bc_qual2)
    output_bc.write(bc_lines)
    seq_lines = format_fastq_record(read_data[header_index], read_seq[bc1_len:-bc2_len], read_qual[bc1_len:-bc2_len])
    output_read.write(seq_lines)

    return
开发者ID:rob-knight,项目名称:qiime,代码行数:95,代码来源:extract_barcodes.py

示例13: process_barcode_paired_end_data

# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def process_barcode_paired_end_data(
    read1_data,
    read2_data,
    output_bc_fastq,
    output_fastq1,
    output_fastq2,
    bc1_len=6,
    bc2_len=6,
    rev_comp_bc1=False,
    rev_comp_bc2=False,
    attempt_read_orientation=False,
    forward_primers=None,
    reverse_primers=None,
    output_bc_not_oriented=None,
    fastq1_out_not_oriented=None,
    fastq2_out_not_oriented=None,
):
    """ Processes, writes paired-end barcode data, parsed sequences
    
    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq1: open output fastq reads 1 filepath
    output_fastq2: open output fastq reads 2 filepath
    bc1_len: length of barcode to remove from beginning of read1 data
    bc2_len: length of barcode to remove from beginning of read2 data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, will attempt to orient the reads 
        according to the forward primers in the mapping file. If primer is 
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current 
        orientation) the read will either be written to the forward (read 1) or
        reverse (read 2) reads for the case of paired files, or the read will be
        reverse complemented in the case of stitched reads.
    forward_primers: list of regular expression generators, forward primers
    reverse_primers: list of regular expression generators, reverse primers
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq1_out_not_oriented: Open filepath to write reads 1 where primers
        can't be found when attempt_read_orientation is True.
    fastq2_out_not_oriented: Open filepath to write reads 2 where primers
        can't be found when attempt_read_orientation is True.
    """

    header_index = 0
    sequence_index = 1
    quality_index = 2

    found_primer_match = False
    # Break from orientation search as soon as a match is found
    if attempt_read_orientation:
        # First check forward primers
        for curr_primer in forward_primers:
            if curr_primer.search(read1_data[sequence_index]):
                read1 = read1_data
                read2 = read2_data
                found_primer_match = True
                break
            if curr_primer.search(read2_data[sequence_index]):
                read1 = read2_data
                read2 = read1_data
                found_primer_match = True
                break
        # Check reverse primers if forward primers not found
        if not found_primer_match:
            for curr_primer in reverse_primers:
                if curr_primer.search(read1_data[sequence_index]):
                    read1 = read2_data
                    read2 = read1_data
                    found_primer_match = True
                    break
                if curr_primer.search(read2_data[sequence_index]):
                    read1 = read1_data
                    read2 = read2_data
                    found_primer_match = True
                    break
    else:
        read1 = read1_data
        read2 = read2_data

    if not found_primer_match and attempt_read_orientation:
        read1 = read1_data
        read2 = read2_data
        output_bc = output_bc_not_oriented
        output_read1 = fastq1_out_not_oriented
        output_read2 = fastq2_out_not_oriented
    else:
        output_bc = output_bc_fastq
        output_read1 = output_fastq1
        output_read2 = output_fastq2

    bc_read1 = read1[sequence_index][0:bc1_len]
    bc_read2 = read2[sequence_index][0:bc2_len]
    bc_qual1 = read1[quality_index][0:bc1_len]
    bc_qual2 = read2[quality_index][0:bc2_len]
    if rev_comp_bc1:
        bc_read1 = DNA.rc(bc_read1)
        bc_qual1 = bc_qual1[::-1]
    if rev_comp_bc2:
        bc_read2 = DNA.rc(bc_read2)
#.........这里部分代码省略.........
开发者ID:rob-knight,项目名称:qiime,代码行数:103,代码来源:extract_barcodes.py

示例14: get_primers

# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def get_primers(header,
                mapping_data):
    """ Returns lists of forward/reverse primer regular expression generators

    header:  list of strings of header data.
    mapping_data:  list of lists of mapping data

    Will raise error if either the LinkerPrimerSequence or ReversePrimer fields
        are not present
    """

    if "LinkerPrimerSequence" in header:
        primer_ix = header.index("LinkerPrimerSequence")
    else:
        raise IndexError(
            ("Mapping file is missing LinkerPrimerSequence field."))
    if "ReversePrimer" in header:
        rev_primer_ix = header.index("ReversePrimer")
    else:
        raise IndexError(("Mapping file is missing ReversePrimer field."))

    iupac = {'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C', 'R': '[AG]', 'Y': '[CT]',
             'S': '[GC]', 'W': '[AT]', 'K': '[GT]', 'M': '[AC]', 'B': '[CGT]',
             'D': '[AGT]', 'H': '[ACT]', 'V': '[ACG]', 'N': '[ACGT]'}

    raw_forward_primers = set([])
    raw_forward_rc_primers = set([])
    raw_reverse_primers = set([])
    raw_reverse_rc_primers = set([])

    for line in mapping_data:
        # Split on commas to handle pool of primers
        raw_forward_primers.update([upper(primer).strip() for
                                    primer in line[primer_ix].split(',')])
        raw_forward_rc_primers.update([DNA.rc(primer) for
                                       primer in raw_forward_primers])
        raw_reverse_primers.update([upper(primer).strip() for
                                    primer in line[rev_primer_ix].split(',')])
        raw_reverse_rc_primers.update([DNA.rc(primer) for
                                       primer in raw_reverse_primers])

    if not raw_forward_primers:
        raise ValueError(("No forward primers detected in mapping file."))
    if not raw_reverse_primers:
        raise ValueError(("No reverse primers detected in mapping file."))

    # Finding the forward primers, or rc of reverse primers indicates forward
    # read. Finding the reverse primer, or rc of the forward primers, indicates
    # the reverse read, so these sets are merged.
    raw_forward_primers.update(raw_reverse_rc_primers)
    raw_reverse_primers.update(raw_forward_rc_primers)

    forward_primers = []
    reverse_primers = []
    for curr_primer in raw_forward_primers:
        forward_primers.append(compile(''.join([iupac[symbol] for
                                                symbol in curr_primer])))
    for curr_primer in raw_reverse_primers:
        reverse_primers.append(compile(''.join([iupac[symbol] for
                                                symbol in curr_primer])))

    return forward_primers, reverse_primers
开发者ID:Gaby1212,项目名称:qiime,代码行数:64,代码来源:extract_barcodes.py

示例15: process_fastq_single_end_read_file

# 需要导入模块: from cogent import DNA [as 别名]
# 或者: from cogent.DNA import rc [as 别名]
def process_fastq_single_end_read_file(fastq_read_f,
                                       fastq_barcode_f,
                                       barcode_to_sample_id,
                                       store_unassigned=False,
                                       max_bad_run_length=0,
                                       last_bad_quality_char='B',
                                       min_per_read_length=75,
                                       rev_comp=False,
                                       rev_comp_barcode=False,
                                       seq_max_N=0,
                                       start_seq_id=0,
                                       filter_bad_illumina_qual_digit=True,
                                       log_f=None,
                                       histogram_f=None,
                                       barcode_correction_fn=None,
                                       max_barcode_errors=1.5):
    """parses fastq single-end read file
    """
    header_index = 0
    sequence_index = 1
    quality_index = 2
    
    seq_id = start_seq_id
    
    # prep data for logging
    input_sequence_count = 0
    count_barcode_not_in_map = 0
    count_too_short = 0
    count_too_many_N = 0
    count_bad_illumina_qual_digit = 0
    count_barcode_errors_exceed_max = 0
    sequence_lengths = []
    seqs_per_sample_counts = {}
    
    for bc_data,read_data in izip(MinimalFastqParser(fastq_barcode_f,strict=False),
                                  MinimalFastqParser(fastq_read_f,strict=False)):
        input_sequence_count += 1
        # Confirm match between barcode and read headers
        if not check_header_match(bc_data[header_index],
                                  read_data[header_index]):
            raise FastqParseError,\
             ("Headers of barcode and read do not match. Can't continue. "
              "Confirm that the barcode fastq and read fastq that you are "
              "passing match one another.")
        else:
            header = read_data[header_index]
        
        # Grab the barcode sequence
        barcode = bc_data[sequence_index]
        if rev_comp_barcode:
            barcode = DNA.rc(barcode)
        # Grab the read sequence
        sequence = read_data[1]
        # Grab the read quality
        quality = read_data[2]
        
        # correct the barcode (if applicable) and map to sample id
        num_barcode_errors, corrected_barcode, correction_attempted, sample_id = \
         correct_barcode(barcode,barcode_to_sample_id,barcode_correction_fn)
        # skip samples with too many errors
        if (num_barcode_errors > max_barcode_errors):
          count_barcode_errors_exceed_max += 1
          continue
        
        # skip unassignable samples unless otherwise requested
        if sample_id == None:
          if not store_unassigned:
              count_barcode_not_in_map += 1
              continue
          else:
              sample_id = 'Unassigned'
        
        quality_filter_result, sequence, quality =\
          quality_filter_sequence(header,
                                  sequence,
                                  quality,
                                  max_bad_run_length,
                                  last_bad_quality_char,
                                  min_per_read_length,
                                  seq_max_N,
                                  filter_bad_illumina_qual_digit)
        
        # process quality result
        if quality_filter_result != 0:
            # if the quality filter didn't pass record why and 
            # move on to the next record
            if quality_filter_result == 1:
                count_too_short += 1
            elif quality_filter_result == 2:
                count_too_many_N += 1
            elif quality_filter_result == 3:
                count_bad_illumina_qual_digit += 1
            else:
                raise ValueError,\
                 "Unknown quality filter result: %d" % quality_filter_result
            continue
        
        sequence_lengths.append(len(sequence))
        
        try:
#.........这里部分代码省略.........
开发者ID:Ecogenomics,项目名称:FrankenQIIME,代码行数:103,代码来源:split_libraries_fastq.py


注:本文中的cogent.DNA.rc方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。