当前位置: 首页>>代码示例>>C++>>正文


C++ BWTInterval类代码示例

本文整理汇总了C++中BWTInterval的典型用法代码示例。如果您正苦于以下问题:C++ BWTInterval类的具体用法?C++ BWTInterval怎么用?C++ BWTInterval使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了BWTInterval类的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。

示例1: assert

// Update the bit vector with the kmers that were assembled into str
void MetAssemble::markSequenceKmers(const std::string& str)
{
    assert(str.size() >= m_parameters.kmer);
    size_t n = str.size() - m_parameters.kmer + 1;

    for(size_t i = 0; i < n; ++i)
    {
        std::string kseq = str.substr(i, m_parameters.kmer);
        BWTInterval interval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, kseq);
        if(interval.isValid())
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
                m_parameters.pBitVector->updateCAS(j, false, true);
        }

        // Mark the reverse complement k-mers too
        std::string rc_kseq = reverseComplement(kseq);
        interval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, rc_kseq);
        if(interval.isValid())
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
                m_parameters.pBitVector->updateCAS(j, false, true);
        }
    }
}
开发者ID:Buttonwood,项目名称:sga,代码行数:26,代码来源:MetAssembleProcess.cpp

示例2: findInterval

// Find the interval in pBWT corresponding to w
// using a cache of short k-mer intervals to avoid
// some of the iterations
BWTInterval BWTAlgorithms::findIntervalWithCache(const BWT* pBWT, const BWTIntervalCache* pIntervalCache, const std::string& w)
{
    size_t cacheLen = pIntervalCache->getCachedLength();
    if(w.size() < cacheLen)
        return findInterval(pBWT, w);

    // Compute the interval using the cache for the last k bases
    int len = w.size();
    int j = len - cacheLen;

    // Check whether the input string has a '$' in it.
    // We don't cache these strings so if it does
    // we have to do a direct lookup
    if(index(w.c_str() + j, '$') != NULL)
        return findInterval(pBWT, w);

    BWTInterval interval = pIntervalCache->lookup(w.c_str() + j);
    j -= 1;
    for(;j >= 0; --j)
    {
        char curr = w[j];
        updateInterval(interval, curr, pBWT);
        if(!interval.isValid())
            return interval;
    }
    return interval;
}
开发者ID:Nanapeel,项目名称:StriDe,代码行数:30,代码来源:BWTAlgorithms.cpp

示例3: assert

size_t BWTAlgorithms::countSequenceOccurrencesSingleStrand(const std::string& w, const BWTIndexSet& indices)
{
    assert(indices.pBWT != NULL);
    assert(indices.pCache != NULL);

    BWTInterval interval = findIntervalWithCache(indices.pBWT, indices.pCache, w);
    return interval.isValid() ? interval.size() : 0;
}
开发者ID:zhangzhen,项目名称:sga,代码行数:8,代码来源:BWTAlgorithms.cpp

示例4: exit

// Add a seed read to the cluster. Overlaps will be found for 
// each seed read to grow the cluster
ClusterNode ReadCluster::addSeed(const std::string& sequence, bool bCheckInIndex)
{
    // Check if this read is a substring
    SeqRecord tempRecord;
    tempRecord.id = "cluster-seed";
    tempRecord.seq = sequence;

    OverlapBlockList tempBlockList;
    OverlapResult overlapResult = m_pOverlapper->alignReadDuplicate(tempRecord, &tempBlockList);
    if(overlapResult.isSubstring)
    {
        // If bCheckInIndex is true, then we are extending clusters from reads that are in the FM-index
        // In this case, seeding a substring read is an error and we abort. If the seed is NOT in the index
        // we just emit a warning and ignore the seed
        if(bCheckInIndex)
        {
            std::cerr << "Error: The seed used for sga-cluster-extend is a substring of some read.\n";
            std::cerr << "Please run sga rmdup before cluster\n";
            std::cerr << "Sequence: " << sequence << "\n";
            exit(1);
        }
        else
        {
            std::cerr << "Warning: The cluster sequence to extend is a substring of some read. This seed is being skipped\n";
            std::cerr << "Sequence: " << sequence << "\n";
            ClusterNode emptyNode;

            // Set an invalid interval
            emptyNode.interval.lower = 0;
            emptyNode.interval.upper = -1;
            return emptyNode;
        }
    }

    // Find the interval in the fm-index containing the read
    const BWT* pBWT = m_pOverlapper->getBWT();
    BWTInterval readInterval = BWTAlgorithms::findInterval(pBWT, sequence);
    BWTAlgorithms::updateInterval(readInterval, '$', pBWT);

    // When building primary clusters, we require each read to be in the index.
    if(bCheckInIndex)
    {
        if(!readInterval.isValid())
        {
            std::cerr << "sga cluster error: The seed read is not part of the FM-index.\n";
            exit(EXIT_FAILURE);
        }
    }

    ClusterNode node;
    node.sequence = sequence;
    node.interval = readInterval;
    node.isReverseInterval = false;
    m_usedIndex.insert(readInterval.lower);
    m_queue.push(node);
    return node;
}
开发者ID:alicarea,项目名称:sga,代码行数:59,代码来源:ReadCluster.cpp

示例5: extractRankedPrefixes

// Extract all strings found from a backwards search starting at the given interval
RankedPrefixVector BWTAlgorithms::extractRankedPrefixes(const BWT* pBWT, BWTInterval interval)
{
    std::string curr;
    RankedPrefixVector output;
    output.reserve(interval.size());
    _extractRankedPrefixes(pBWT, interval, curr, &output);
    return output;
}
开发者ID:Nanapeel,项目名称:StriDe,代码行数:9,代码来源:BWTAlgorithms.cpp

示例6: PROFILE_FUNC

SequenceOverlapPairVector KmerOverlaps::retrieveMatches(const std::string& query, size_t k, 
                                                        int min_overlap, double min_identity,
                                                        int bandwidth, const BWTIndexSet& indices)
{
    PROFILE_FUNC("OverlapHaplotypeBuilder::retrieveMatches")
    assert(indices.pBWT != NULL);
    assert(indices.pSSA != NULL);

    int64_t max_interval_size = 200;
    SequenceOverlapPairVector overlap_vector;

    // Use the FM-index to look up intervals for each kmer of the read. Each index
    // in the interval is stored individually in the KmerMatchMap. We then
    // backtrack to map these kmer indices to read IDs. As reads can share
    // multiple kmers, we use the map to avoid redundant lookups.
    // There is likely a faster algorithm which performs direct decompression
    // of the read sequences without having to expand the intervals to individual
    // indices. The current algorithm suffices for now.
    KmerMatchMap prematchMap;
    size_t num_kmers = query.size() - k + 1;
    for(size_t i = 0; i < num_kmers; ++i)
    {
        std::string kmer = query.substr(i, k);
        BWTInterval interval = BWTAlgorithms::findInterval(indices, kmer);
        if(interval.isValid() && interval.size() < max_interval_size) 
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), false };
                prematchMap.insert(std::make_pair(match, false));
            }
        }

        kmer = reverseComplement(kmer);
        interval = BWTAlgorithms::findInterval(indices, kmer);
        if(interval.isValid() && interval.size() < max_interval_size) 
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), true };
                prematchMap.insert(std::make_pair(match, false));
            }
        }
    }

    // Backtrack through the kmer indices to turn them into read indices.
    // This mirrors the calcSA function in SampledSuffixArray except we mark each entry
    // as visited once it is processed.
    KmerMatchSet matches;
    for(KmerMatchMap::iterator iter = prematchMap.begin(); iter != prematchMap.end(); ++iter)
    {
        // This index has been visited
        if(iter->second)
            continue;

        // Mark this as visited
        iter->second = true;

        // Backtrack the index until we hit the starting symbol
        KmerMatch out_match = iter->first;
        while(1) 
        {
            char b = indices.pBWT->getChar(out_match.index);
            out_match.index = indices.pBWT->getPC(b) + indices.pBWT->getOcc(b, out_match.index - 1);

            // Check if the hash indicates we have visited this index. If so, stop the backtrack
            KmerMatchMap::iterator find_iter = prematchMap.find(out_match);
            if(find_iter != prematchMap.end())
            {
                // We have processed this index already
                if(find_iter->second)
                    break;
                else
                    find_iter->second = true;
            }

            if(b == '$')
            {
                // We've found the lexicographic index for this read. Turn it into a proper ID
                out_match.index = indices.pSSA->lookupLexoRank(out_match.index);
                matches.insert(out_match);
                break;
            }
        }
    }

    // Refine the matches by computing proper overlaps between the sequences
    // Use the overlaps that meet the thresholds to build a multiple alignment
    for(KmerMatchSet::iterator iter = matches.begin(); iter != matches.end(); ++iter)
    {
        std::string match_sequence = BWTAlgorithms::extractString(indices.pBWT, iter->index);
        if(iter->is_reverse)
            match_sequence = reverseComplement(match_sequence);
        
        // Ignore identical matches
        if(match_sequence == query)
            continue;

        // Compute the overlap. If the kmer match occurs a single time in each sequence we use
        // the banded extension overlap strategy. Otherwise we use the slow O(M*N) overlapper.
//.........这里部分代码省略.........
开发者ID:csf-ngs,项目名称:sga,代码行数:101,代码来源:KmerOverlaps.cpp

示例7: PROFILE_FUNC

// Extract reads from an FM-index that have a k-mer match to any given haplotypes
// Returns true if the reads were successfully extracted, false if there are 
// more reads than maxReads
bool HapgenUtil::extractHaplotypeReads(const StringVector& haplotypes, 
                                       const BWTIndexSet& indices,
                                       int k,
                                       bool doReverse,
                                       size_t maxReads,
                                       int64_t maxIntervalSize,
                                       SeqRecordVector* pOutReads, 
                                       SeqRecordVector* pOutMates)
{
    PROFILE_FUNC("HapgenUtil::extractHaplotypeReads")
    // Extract the set of reads that have at least one kmer shared with these haplotypes
    // This is a bit of a lengthy procedure with a few steps:
    // 1) extract all the kmers in the haplotypes
    // 2) find the intervals for the kmers in the fm-index
    // 3) compute the set of read indices of the reads from the intervals (using the sampled suffix array)
    // 4) finally, extract the read sequences from the index
    // Make a set of kmers from the haplotypes
    std::set<std::string> kmerSet;
    for(size_t i = 0; i < haplotypes.size(); ++i)
    {
        const std::string& h = haplotypes[i];
        if((int)h.size() < k)
            continue;

        for(size_t j = 0; j < h.size() - k + 1; ++j)
        {
            std::string ks = h.substr(j, k);
            if(doReverse)
                ks = reverseComplement(ks);
            kmerSet.insert(ks);
        }
    }

    // Compute suffix array intervals for the kmers
    std::vector<BWTInterval> intervals;
    for(std::set<std::string>::const_iterator iter = kmerSet.begin(); iter != kmerSet.end(); ++iter)
    {
        BWTInterval interval = BWTAlgorithms::findInterval(indices, *iter);
        if(interval.size() < maxIntervalSize)
            intervals.push_back(interval);
    }

    // Compute the set of reads ids 
    std::set<int64_t> readIndices;
    for(size_t i = 0; i < intervals.size(); ++i)
    {
        BWTInterval interval = intervals[i];
        for(int64_t j = interval.lower; j <= interval.upper; ++j)
        {
            // Get index from sampled suffix array
            SAElem elem = indices.pSSA->calcSA(j, indices.pBWT);
            readIndices.insert(elem.getID());
        }
    }

    // Check if we have hit the limit of extracting too many reads
    if(readIndices.size() > maxReads)
        return false;

    for(std::set<int64_t>::const_iterator iter = readIndices.begin(); iter != readIndices.end(); ++iter)
    {
        int64_t idx = *iter;
        
        // Extract the read
        std::stringstream namer;
        namer << "idx-" << idx;
        SeqRecord record;
        record.id = namer.str();
        record.seq = BWTAlgorithms::extractString(indices.pBWT, idx);

        assert(indices.pQualityTable != NULL);
        record.qual = indices.pQualityTable->getQualityString(idx, record.seq.length());
        if(!record.seq.empty())
            pOutReads->push_back(record);

        // Optionally extract its mate
        // If the index is constructed properly, 
        // paired reads are in adjacent indices with the
        // first read at even indices
        if(pOutMates != NULL)
        {
            int64_t mateIdx = idx;
            if(idx % 2 == 0)
                mateIdx += 1;
            else
                mateIdx -= 1;
            
            std::stringstream mateName;
            mateName << "idx-" << mateIdx;
            SeqRecord mateRecord;
            mateRecord.id = mateName.str();
            mateRecord.seq = BWTAlgorithms::extractString(indices.pBWT, mateIdx);
            mateRecord.qual = indices.pQualityTable->getQualityString(mateIdx, mateRecord.seq.length());
            if(!record.seq.empty() && !mateRecord.seq.empty())
                pOutMates->push_back(mateRecord);
        }
    }
//.........这里部分代码省略.........
开发者ID:SherlockThang,项目名称:sga,代码行数:101,代码来源:HapgenUtil.cpp

示例8: PROFILE_FUNC

// Align the haplotype to the reference genome represented by the BWT/SSA pair
void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
        const std::string& haplotype,
        const BWTIndexSet& referenceIndex,
        const ReadTable* pReferenceTable,
        HapgenAlignmentVector& outAlignments)
{
    PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer")
    int64_t max_interval_size = 4;

    if(haplotype.size() < k)
        return;

    std::vector<int> event_count_vector;
    std::vector<HapgenAlignment> tmp_alignments;
    int min_events = std::numeric_limits<int>::max();

    // Align forward and reverse haplotype to reference
    for(size_t i = 0; i <= 1; ++i)
    {
        bool is_reverse = i == 1;
        std::string query = is_reverse ? reverseComplement(haplotype) : haplotype;

        // Find shared kmers between the haplotype and the reference
        CandidateVector candidates;

        size_t nqk = query.size() - k + 1;
        for(size_t j = 0; j < nqk; ++j)
        {
            std::string kmer = query.substr(j, k);

            // Find the interval of this kmer in the reference
            BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer);
            if(!interval.isValid() || interval.size() >= max_interval_size)
                continue; // not found or too repetitive

            // Extract the reference location of these hits
            for(int64_t k = interval.lower; k  <= interval.upper; ++k)
            {
                SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT);

                // Make a candidate alignment
                CandidateKmerAlignment candidate;
                candidate.query_index = j;
                candidate.target_index = elem.getPos();
                candidate.target_extrapolated_start = candidate.target_index - candidate.query_index;
                candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size();
                candidate.target_sequence_id = elem.getID();
                candidates.push_back(candidate);
            }
        }

        // Remove duplicate candidates
        std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart);
        CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart);
        candidates.resize(new_end - candidates.begin());

        for(size_t j = 0; j < candidates.size(); ++j)
        {
            // Extract window around reference
            size_t window_size = 200;
            int ref_start = candidates[j].target_extrapolated_start - window_size;
            int ref_end = candidates[j].target_extrapolated_end + window_size;

            const DNAString& ref_sequence = pReferenceTable->getRead(candidates[j].target_sequence_id).seq;
            if(ref_start < 0)
                ref_start = 0;

            if(ref_end > (int)ref_sequence.length())
                ref_end = ref_sequence.length();

            std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start);

            // Align haplotype to the reference
            SequenceOverlap overlap = Overlapper::computeOverlap(query, ref_substring);

            // Skip terrible alignments
            double percent_aligned = (double)overlap.getOverlapLength() / query.size();
            if(percent_aligned < 0.95f)
                continue;
            /*
            // Skip alignments that are not full-length matches of the haplotype
            if(overlap.match[0].start != 0 || overlap.match[0].end != (int)haplotype.size() - 1)
                continue;
            */
            int alignment_start = ref_start + overlap.match[1].start;
            int alignment_end = ref_start + overlap.match[1].end; // inclusive
            int alignment_length = alignment_end - alignment_start + 1;

            // Crude count of the number of distinct variation events
            int num_events = overlap.edit_distance;
            std::stringstream c_parser(overlap.cigar);
            int len;
            char t;
            while(c_parser >> len >> t)
            {
                assert(len > 0);

                // Only count one event per insertion/deletion
                if(t == 'D' || t == 'I')
//.........这里部分代码省略.........
开发者ID:nathanhaigh,项目名称:sga,代码行数:101,代码来源:HapgenUtil.cpp

示例9: assert

AlphaCount64 BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(const std::string str,
                                                                   const BWT* pBWT,
                                                                   EdgeDir direction,
                                                                   const BWTIntervalCache* pFwdCache)
{
    size_t k = str.size();
    size_t p = k - 1;

    std::string pmer;

    // In the sense direction, we extend from the 3' end
    if(direction == ED_SENSE)
        pmer = str.substr(1, p);
    else
        pmer = str.substr(0, p);
    assert(pmer.length() == p);
    std::string rc_pmer = reverseComplement(pmer);

    // As we only have a single index, we can only directly look up
    // the extensions for either the pmer or its reverse complement
    // In the SENSE extension direction, we directly look up for
    // the reverse complement. In ANTISENSE we directly look up for
    // the pmer.

    // Get the extension bases
    AlphaCount64 extensions;
    AlphaCount64 rc_extensions;

    // Set up pointers to the data to fill in/query
    // depending on the direction of the extension
    AlphaCount64* pDirectEC;
    AlphaCount64* pIndirectEC;
    std::string* pDirectStr;
    std::string* pIndirectStr;
    if(direction == ED_SENSE)
    {
        pDirectEC = &rc_extensions;
        pDirectStr = &rc_pmer;

        pIndirectEC = &extensions;
        pIndirectStr = &pmer;
    }
    else
    {
        pDirectEC = &extensions;
        pDirectStr = &pmer;

        pIndirectEC = &rc_extensions;
        pIndirectStr = &rc_pmer;
    }

    // Get the interval for the direct query string
    BWTInterval interval;

    // Use interval cache if available
    if(pFwdCache)
        interval = BWTAlgorithms::findIntervalWithCache(pBWT, pFwdCache, *pDirectStr);
    else
        interval = BWTAlgorithms::findInterval(pBWT, *pDirectStr);

    // Fill in the direct count
    if(interval.isValid())
        *pDirectEC = BWTAlgorithms::getExtCount(interval, pBWT);

    // Now, for the non-direct index, query the 4 possible k-mers that are adjacent to the pmer
    // Setup the query sequence
    std::string query(k, 'A');
    int varIdx = query.size() - 1;
    query.replace(0, p, *pIndirectStr);

    for(int i = 0; i < BWT_ALPHABET::size; ++i)
    {
        // Transform the query
        char b = BWT_ALPHABET::getChar(i);
        query[varIdx] = b;

        // Perform lookup
        if(pFwdCache)
            interval = BWTAlgorithms::findIntervalWithCache(pBWT, pFwdCache, query);
        else
            interval = BWTAlgorithms::findInterval(pBWT, query);

        // Update the extension count
        if(interval.isValid())
            pIndirectEC->add(b, interval.size());
    }

    // Switch the reverse-complement extensions to the same strand as the str
    rc_extensions.complement();
    extensions += rc_extensions;
    return extensions;
}
开发者ID:Nanapeel,项目名称:StriDe,代码行数:92,代码来源:BWTAlgorithms.cpp

示例10: exit

ClusterResult ClusterProcess::process(const SequenceWorkItem& item)
{
    // Calculate the intervals in the forward FM-index for this read
    const BWT* pBWT = m_pOverlapper->getBWT();

    // Check if this read is a substring
    OverlapBlockList tempBlockList;
    OverlapResult overlapResult = m_pOverlapper->alignReadDuplicate(item.read, &tempBlockList);
    if(overlapResult.isSubstring)
    {
        std::cerr << "Error: substring reads found in sga-cluster. Please run rmdup before cluster\n";
        exit(1);
    }

    // Find the interval in the fm-index containing the read
    std::string readString = item.read.seq.toString();
    BWTInterval readInterval = BWTAlgorithms::findInterval(pBWT, readString);
    BWTAlgorithms::updateInterval(readInterval, '$', pBWT);

    // The read must be present in the index
    assert(readInterval.isValid());

    // Check if this read has been used yet
    bool used = false;
    for(int64_t i = readInterval.lower; i <= readInterval.upper; ++i)
    {
        if(m_pMarkedReads->test(i))
        {
            used = true;
            break;
        }
    }

    ClusterResult result;
    if(used)
        return result; // already part of a cluster, return nothing

    // Compute a new cluster around this read
    std::set<int64_t> usedIndex;
    ClusterNodeQueue queue;
    ClusterNode node;
    node.sequence = item.read.seq.toString();
    node.interval = readInterval;
    node.isReverseInterval = false;
    usedIndex.insert(readInterval.lower);
    queue.push(node);
    while(!queue.empty())
    {
        ClusterNode node = queue.front();
        queue.pop();

        // Update the used index and the result structure with this node's data
        result.clusterNodes.push_back(node);

        SeqRecord tempRecord;
        tempRecord.id = "cluster";
        tempRecord.seq = node.sequence;
        OverlapBlockList blockList;
        OverlapResult result = m_pOverlapper->overlapRead(tempRecord, m_minOverlap, &blockList);
        //m_pOverlapper->buildForwardHistory(&blockList);
        
        // Parse each member of the block list and potentially expand the cluster
        for(OverlapBlockList::const_iterator iter = blockList.begin(); iter != blockList.end(); ++iter)
        {
            // Check if the reads in this block are part of the cluster already
            BWTInterval canonicalInterval = iter->getCanonicalInterval();
            int64_t canonicalIndex = canonicalInterval.lower;
            if(usedIndex.count(canonicalIndex) == 0)
            {
                usedIndex.insert(canonicalIndex);
                ClusterNode newNode;
                newNode.sequence = iter->getFullString(node.sequence);
                newNode.interval = canonicalInterval;
                newNode.isReverseInterval = iter->flags.isTargetRev();
                queue.push(newNode);
            }
        }
    }

    // If some work was performed, update the bitvector so other threads do not try to merge the same set of reads.
    // This uses compare-and-swap instructions to ensure the uppdate is atomic. 
    // If some other thread has merged this set (and updated
    // the bitvector), we discard all the merged data.
    
    // As a given set of reads should all be merged together, we only need to make sure we atomically update
    // the bit for the read with the lowest index in the set.

    // Sort the intervals into ascending order and remove any duplicate intervals (which can occur
    // if the subgraph has a simple cycle)
    std::sort(result.clusterNodes.begin(), result.clusterNodes.end(), ClusterNode::compare);
    std::vector<ClusterNode>::iterator newEnd = std::unique(result.clusterNodes.begin(),
                                                            result.clusterNodes.end(),
                                                            ClusterNode::equal);

    size_t oldSize = result.clusterNodes.size();
    result.clusterNodes.erase(newEnd, result.clusterNodes.end());
    size_t newSize = result.clusterNodes.size();
    if(oldSize != newSize)
        std::cout << "Warning: duplicate cluster nodes were found\n";

//.........这里部分代码省略.........
开发者ID:avilella,项目名称:sga,代码行数:101,代码来源:ClusterProcess.cpp

示例11: PROFILE_FUNC

GraphCompareResult GraphCompare::process(const SequenceWorkItem& item) const
{
    PROFILE_FUNC("GraphCompare::process")
    GraphCompareResult result;
    SeqRecord currRead = item.read;
    std::string w = item.read.seq.toString();
    int len = w.size();
    int num_kmers = len - m_parameters.kmer + 1;

    // Check if the read is long enough to be used
    if(w.size() < m_parameters.kmer)
        return result;

    // Process the kmers that have not been previously visited
    // We only try to find variants from one k-mer per read. 
    // This heurestic handles the situation where we process a kmer,
    // spend a lot of time trying to assemble it into a string and fail,
    // then try again with the next k-mer in the read. Since all the k-mers
    // in a read are connected in the de Bruijn graph, if the first attempt
    // to assemble a k-mer into a variant fails, it is very unlikely that
    // subsequent attempts will succeed.
    bool variantAttempted = false;
    for(int j = 0; j < num_kmers; ++j)
    {
        std::string kmer = w.substr(j, m_parameters.kmer);
        std::string rc_kmer = reverseComplement(kmer);

        // Use the lexicographically lower of the kmer and its pair as the key in the bloom filter
        std::string& key_kmer = kmer < rc_kmer ? kmer : rc_kmer;

        // Check if this k-mer is marked as used by the bloom filter
        if(m_parameters.pBloomFilter->test(key_kmer.c_str(), key_kmer.size()))
            continue;
        
    	// Get the interval for this kmer
        BWTInterval interval = BWTAlgorithms::findInterval(m_parameters.variantIndex, kmer);
        BWTInterval rc_interval = BWTAlgorithms::findInterval(m_parameters.variantIndex, rc_kmer);
        
        size_t count = interval.size();
        if(rc_interval.isValid())
            count += rc_interval.size();

        bool both_strands = interval.size() > 0 && rc_interval.size() > 0;

        size_t min_base_coverage = 1;
        if(count >= m_parameters.minDiscoveryCount && count < m_parameters.maxDiscoveryCount && !variantAttempted && both_strands)
        {
            // Update the bloom filter to contain this kmer
            m_parameters.pBloomFilter->add(key_kmer.c_str(), key_kmer.size());

            // Check if this k-mer is present in the other base index
            size_t base_count = BWTAlgorithms::countSequenceOccurrences(kmer, m_parameters.baseIndex);
            
            if(Verbosity::Instance().getPrintLevel() > 6)
                std::cout << "Read: " << currRead.id << " k: " << j << " CV: " << interval.size() << "/" << rc_interval.size() << " " << base_count << "\n";
            
            // k-mer present in the base read set, skip it
            if(base_count >= min_base_coverage)
                continue;
            
            if(Verbosity::Instance().getPrintLevel() > 0)
                std::cout << "Variant read: " << w << "\n";

            // variant k-mer, attempt to assemble it into haplotypes
            GraphBuildResult build_result = processVariantKmer(kmer, count);
            variantAttempted = true;

            // Mark the kmers of the variant haplotypes as being visited
            for(size_t vhi = 0; vhi < build_result.variant_haplotypes.size(); ++vhi)
                markVariantSequenceKmers(build_result.variant_haplotypes[vhi]);
            
            // If we assembled anything, run Dindel on the haplotypes
            if(build_result.variant_haplotypes.size() > 0)
            {
                if(Verbosity::Instance().getPrintLevel() > 0)
                    std::cout << "Running dindel\n";

                std::stringstream baseVCFSS;
                std::stringstream variantVCFSS;
                std::stringstream callsVCFSS;
                DindelReadReferenceAlignmentVector alignments;

                DindelReturnCode drc = DindelUtil::runDindelPairMatePair(kmer,
                                                                         build_result.base_haplotypes,
                                                                         build_result.variant_haplotypes,
                                                                         m_parameters,
                                                                         baseVCFSS,
                                                                         variantVCFSS,
                                                                         callsVCFSS,
                                                                         &alignments);
                
                //
                if(Verbosity::Instance().getPrintLevel() > 0)
                {
                    std::cout << "Dindel returned " << drc << "\n";
                    std::cout << "base vcf records:\n" << baseVCFSS.str() << "\n";
                    std::cout << "variant vcf records:\n" << variantVCFSS.str() << "\n";
                }
                
                // DINDEL ran without error, push its results to the output
//.........这里部分代码省略.........
开发者ID:Bioinformaticsnl,项目名称:sga,代码行数:101,代码来源:GraphCompare.cpp

示例12: visitedKmers

MetAssembleResult MetAssemble::process(const SequenceWorkItem& item)
{
    MetAssembleResult result;
    SeqRecord currRead = item.read;
    std::string w = item.read.seq.toString();
    if(w.size() <= m_parameters.kmer)
    {
        return result;
    }

    // Perform a backwards search using the read sequence
    // Check which k-mers have already been visited using the
    // shared bitvector. If any bit in the range [l,u] is set
    // for a suffix of the read, then we do not visit those kmers
    // later
    int len = w.size();
    int num_kmers = len - m_parameters.kmer + 1;
    std::vector<bool> visitedKmers(false, num_kmers);

    int j = len - 1;
    char curr = w[j];
    BWTInterval interval;
    BWTAlgorithms::initInterval(interval, curr, m_parameters.pBWT);
    --j;

    for(;j >= 0; --j)
    {
        curr = w[j];
        BWTAlgorithms::updateInterval(interval, curr, m_parameters.pBWT);
        assert(interval.isValid());

        // At this point interval represents the suffix [j,len)
        // Check if the starting point of this interval is set
        if(j < num_kmers)
            visitedKmers[j] = m_parameters.pBitVector->test(interval.lower);
    }
    
    // Process the kmers that have not been previously visited
    for(j = 0; j < num_kmers; ++j)
    {
        if(visitedKmers[j])
            continue; // skip

        std::string kmer = w.substr(j, m_parameters.kmer);
        
        // Get the interval for this kmer
        BWTInterval interval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, kmer);

        // Check if this interval has been marked by a previous iteration of the loop
        assert(interval.isValid());
        if(m_parameters.pBitVector->test(interval.lower))
            continue;

        BWTInterval rc_interval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, reverseComplement(kmer));

        size_t count = interval.size();
        if(rc_interval.isValid())
            count += rc_interval.size();

        if(count >= m_parameters.kmerThreshold)
        {
            // Process the kmer
            std::string contig = processKmer(kmer, count);

            // We must determine if this contig has been assembled by another thread.
            // Break the contig into lexicographically ordered set of kmers. The lowest kmer is chosen
            // to represent the contig. If this kmer has been marked as visited, we discard the contig
            // otherwise we mark all kmers in the contig and output the contig.
            StringVector kmers = getLexicographicKmers(contig);

            // Get the lowest kmer in the set
            assert(!kmers.empty());
            std::string lowest = kmers.front();
            BWTInterval lowInterval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, lowest);
            BWTInterval lowRCInterval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, reverseComplement(lowest));
            bool marked = false;

            // If the kmer exists in the read set (the interval is valid), we attempt to mark it
            // Otherwise, we attempt to mark its reverse complement. If either call succeeds
            // we output the contig. This block of code gives the synchronization between threads.
            // If multiple threads attempt to assemble the same contig, marked will be true for only
            // one of the threads.
            if(lowInterval.isValid())
                marked = m_parameters.pBitVector->updateCAS(lowInterval.lower, false, true);
            else
                marked = m_parameters.pBitVector->updateCAS(lowRCInterval.lower, false, true);

            // Mark all the kmers in the contig so they will not be visited again
            markSequenceKmers(contig);

            // If the collision check passed, output the contig
            if(marked)
            {
                if(contig.size() >= m_parameters.minLength)
                    result.contigs.push_back(contig);
            }
        }

        // Update the bit vector for the source kmer
        for(int64_t i = interval.lower; i <= interval.upper; ++i)
//.........这里部分代码省略.........
开发者ID:Buttonwood,项目名称:sga,代码行数:101,代码来源:MetAssembleProcess.cpp

示例13: PROFILE_FUNC

SequenceOverlapPairVector KmerOverlaps::PacBioRetrieveMatches(const std::string& query, size_t k, 
                                                        int min_overlap, double min_identity,
                                                        int bandwidth, const BWTIndexSet& indices,
														KmerDistribution& kd, int round)
{
    PROFILE_FUNC("OverlapHaplotypeBuilder::PacBioRetrieveMatches")
    assert(indices.pBWT != NULL);
    assert(indices.pSSA != NULL);
	
	//size_t numStringCount[query.size()+1] = 0;
	
	int64_t intervalSum = 0;
    static size_t n_calls = 0;
    static size_t n_candidates = 0;
    static size_t n_output = 0;
    static double t_time = 0;
	size_t count = 0;
	size_t numKmer = 0;
	size_t numRepeatKmer = 0;
	size_t totalKmer = 0;
	size_t numNoSeedRead = 0;
	size_t repeatCutoff = kd.getRepeatKmerCutoff();
	size_t errorCutoff = kd.getMedian() - kd.getSdv();
    Timer timer("test", true);
	
    n_calls++;
	//std::cout<<"PacBioRetrieveMatches\n";
	std::cout<<"\tk :\t"<<k<<"\n";
    SequenceOverlapPairVector overlap_vector;
	std::vector<long> identityVector(100);
	for(int j = 0;j < identityVector.size(); j++)
		identityVector[j] = 0;
		
    // Use the FM-index to look up intervals for each kmer of the read. Each index
    // in the interval is stored individually in the KmerMatchMap. We then
    // backtrack to map these kmer indices to read IDs. As reads can share
    // multiple kmers, we use the map to avoid redundant lookups.
    // There is likely a faster algorithm which performs direct decompression
    // of the read sequences without having to expand the intervals to individual
    // indices. The current algorithm suffices for now.
    KmerMatchMap prematchMap;
    size_t num_kmers = query.size() - k + 1;
	clock_t search_seeds_s = clock(), search_seeds_e;
    
	
	for(size_t i = 0; i < num_kmers; i++)
    {
        std::string kmer = query.substr(i, k);
        BWTInterval interval = BWTAlgorithms::findInterval(indices, kmer);
		if(interval.upper - interval.lower < errorCutoff)
			numNoSeedRead++;
		if((interval.upper - interval.lower) > 20 && (interval.upper - interval.lower) < repeatCutoff)
		{
			numKmer++;
			totalKmer++;
		}
		
		//To avoid the repeat region
		/*if((interval.upper - interval.lower) > repeatCutoff)
		{	
			numRepeatKmer++;
			totalKmer++;
			continue;
		}
		else
			interval.upper = ((interval.upper - interval.lower)>20)?interval.lower + 20 : interval.upper;*/
		
        if(interval.isValid() && interval.size()) 
        {
			//std::cout<<"\tinterval size : "<<interval.upper - interval.lower<<std::endl;
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), false };
                prematchMap.insert(std::make_pair(match, false));
            }
			intervalSum += interval.upper - interval.lower;
			count++;
        }

        kmer = reverseComplement(kmer);
        interval = BWTAlgorithms::findInterval(indices, kmer);
		interval.upper = ((interval.upper - interval.lower)>20)?interval.lower + 20 : interval.upper;
        if(interval.isValid() && interval.size())
        {
            for(int64_t j = interval.lower; j <= interval.upper; ++j)
            {
                KmerMatch match = { i, static_cast<size_t>(j), true };
                prematchMap.insert(std::make_pair(match, false));
            }
			intervalSum += interval.upper - interval.lower;
			count++;
        }
    }
	if(numNoSeedRead == num_kmers)
		std::cout<<"\tnoSeedRead : 1"<<std::endl;
	std::cout<<"\tnumber of kmer : "<<numKmer<<std::endl;
	std::cout<<"\tnumber of RepeatKmer : "<<numRepeatKmer<<std::endl;
	std::cout<<"\tnumber of totalkmer : "<<totalKmer<<std::endl;
	
	
//.........这里部分代码省略.........
开发者ID:sam789123,项目名称:SGA-Overlap-PacBio-Correction,代码行数:101,代码来源:KmerOverlaps.cpp


注:本文中的BWTInterval类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。