本文整理汇总了C++中BWTInterval::isValid方法的典型用法代码示例。如果您正苦于以下问题:C++ BWTInterval::isValid方法的具体用法?C++ BWTInterval::isValid怎么用?C++ BWTInterval::isValid使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类BWTInterval
的用法示例。
在下文中一共展示了BWTInterval::isValid方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: markSequenceKmers
// Update the bit vector with the kmers that were assembled into str
void MetAssemble::markSequenceKmers(const std::string& str)
{
assert(str.size() >= m_parameters.kmer);
size_t n = str.size() - m_parameters.kmer + 1;
for(size_t i = 0; i < n; ++i)
{
std::string kseq = str.substr(i, m_parameters.kmer);
BWTInterval interval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, kseq);
if(interval.isValid())
{
for(int64_t j = interval.lower; j <= interval.upper; ++j)
m_parameters.pBitVector->updateCAS(j, false, true);
}
// Mark the reverse complement k-mers too
std::string rc_kseq = reverseComplement(kseq);
interval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, rc_kseq);
if(interval.isValid())
{
for(int64_t j = interval.lower; j <= interval.upper; ++j)
m_parameters.pBitVector->updateCAS(j, false, true);
}
}
}
示例2: findIntervalWithCache
// Find the interval in pBWT corresponding to w
// using a cache of short k-mer intervals to avoid
// some of the iterations
BWTInterval BWTAlgorithms::findIntervalWithCache(const BWT* pBWT, const BWTIntervalCache* pIntervalCache, const std::string& w)
{
size_t cacheLen = pIntervalCache->getCachedLength();
if(w.size() < cacheLen)
return findInterval(pBWT, w);
// Compute the interval using the cache for the last k bases
int len = w.size();
int j = len - cacheLen;
// Check whether the input string has a '$' in it.
// We don't cache these strings so if it does
// we have to do a direct lookup
if(index(w.c_str() + j, '$') != NULL)
return findInterval(pBWT, w);
BWTInterval interval = pIntervalCache->lookup(w.c_str() + j);
j -= 1;
for(;j >= 0; --j)
{
char curr = w[j];
updateInterval(interval, curr, pBWT);
if(!interval.isValid())
return interval;
}
return interval;
}
示例3: countSequenceOccurrencesSingleStrand
size_t BWTAlgorithms::countSequenceOccurrencesSingleStrand(const std::string& w, const BWTIndexSet& indices)
{
assert(indices.pBWT != NULL);
assert(indices.pCache != NULL);
BWTInterval interval = findIntervalWithCache(indices.pBWT, indices.pCache, w);
return interval.isValid() ? interval.size() : 0;
}
示例4: addSeed
// Add a seed read to the cluster. Overlaps will be found for
// each seed read to grow the cluster
ClusterNode ReadCluster::addSeed(const std::string& sequence, bool bCheckInIndex)
{
// Check if this read is a substring
SeqRecord tempRecord;
tempRecord.id = "cluster-seed";
tempRecord.seq = sequence;
OverlapBlockList tempBlockList;
OverlapResult overlapResult = m_pOverlapper->alignReadDuplicate(tempRecord, &tempBlockList);
if(overlapResult.isSubstring)
{
// If bCheckInIndex is true, then we are extending clusters from reads that are in the FM-index
// In this case, seeding a substring read is an error and we abort. If the seed is NOT in the index
// we just emit a warning and ignore the seed
if(bCheckInIndex)
{
std::cerr << "Error: The seed used for sga-cluster-extend is a substring of some read.\n";
std::cerr << "Please run sga rmdup before cluster\n";
std::cerr << "Sequence: " << sequence << "\n";
exit(1);
}
else
{
std::cerr << "Warning: The cluster sequence to extend is a substring of some read. This seed is being skipped\n";
std::cerr << "Sequence: " << sequence << "\n";
ClusterNode emptyNode;
// Set an invalid interval
emptyNode.interval.lower = 0;
emptyNode.interval.upper = -1;
return emptyNode;
}
}
// Find the interval in the fm-index containing the read
const BWT* pBWT = m_pOverlapper->getBWT();
BWTInterval readInterval = BWTAlgorithms::findInterval(pBWT, sequence);
BWTAlgorithms::updateInterval(readInterval, '$', pBWT);
// When building primary clusters, we require each read to be in the index.
if(bCheckInIndex)
{
if(!readInterval.isValid())
{
std::cerr << "sga cluster error: The seed read is not part of the FM-index.\n";
exit(EXIT_FAILURE);
}
}
ClusterNode node;
node.sequence = sequence;
node.interval = readInterval;
node.isReverseInterval = false;
m_usedIndex.insert(readInterval.lower);
m_queue.push(node);
return node;
}
示例5: findInterval
// Find the interval in pBWT corresponding to w
// If w does not exist in the BWT, the interval
// coordinates [l, u] will be such that l > u
BWTInterval BWTAlgorithms::findInterval(const BWT* pBWT, const std::string& w)
{
int len = w.size();
int j = len - 1;
char curr = w[j];
BWTInterval interval;
initInterval(interval, curr, pBWT);
--j;
for(;j >= 0; --j)
{
curr = w[j];
updateInterval(interval, curr, pBWT);
if(!interval.isValid())
return interval;
}
return interval;
}
示例6: retrieveMatches
SequenceOverlapPairVector KmerOverlaps::retrieveMatches(const std::string& query, size_t k,
int min_overlap, double min_identity,
int bandwidth, const BWTIndexSet& indices)
{
PROFILE_FUNC("OverlapHaplotypeBuilder::retrieveMatches")
assert(indices.pBWT != NULL);
assert(indices.pSSA != NULL);
int64_t max_interval_size = 200;
SequenceOverlapPairVector overlap_vector;
// Use the FM-index to look up intervals for each kmer of the read. Each index
// in the interval is stored individually in the KmerMatchMap. We then
// backtrack to map these kmer indices to read IDs. As reads can share
// multiple kmers, we use the map to avoid redundant lookups.
// There is likely a faster algorithm which performs direct decompression
// of the read sequences without having to expand the intervals to individual
// indices. The current algorithm suffices for now.
KmerMatchMap prematchMap;
size_t num_kmers = query.size() - k + 1;
for(size_t i = 0; i < num_kmers; ++i)
{
std::string kmer = query.substr(i, k);
BWTInterval interval = BWTAlgorithms::findInterval(indices, kmer);
if(interval.isValid() && interval.size() < max_interval_size)
{
for(int64_t j = interval.lower; j <= interval.upper; ++j)
{
KmerMatch match = { i, static_cast<size_t>(j), false };
prematchMap.insert(std::make_pair(match, false));
}
}
kmer = reverseComplement(kmer);
interval = BWTAlgorithms::findInterval(indices, kmer);
if(interval.isValid() && interval.size() < max_interval_size)
{
for(int64_t j = interval.lower; j <= interval.upper; ++j)
{
KmerMatch match = { i, static_cast<size_t>(j), true };
prematchMap.insert(std::make_pair(match, false));
}
}
}
// Backtrack through the kmer indices to turn them into read indices.
// This mirrors the calcSA function in SampledSuffixArray except we mark each entry
// as visited once it is processed.
KmerMatchSet matches;
for(KmerMatchMap::iterator iter = prematchMap.begin(); iter != prematchMap.end(); ++iter)
{
// This index has been visited
if(iter->second)
continue;
// Mark this as visited
iter->second = true;
// Backtrack the index until we hit the starting symbol
KmerMatch out_match = iter->first;
while(1)
{
char b = indices.pBWT->getChar(out_match.index);
out_match.index = indices.pBWT->getPC(b) + indices.pBWT->getOcc(b, out_match.index - 1);
// Check if the hash indicates we have visited this index. If so, stop the backtrack
KmerMatchMap::iterator find_iter = prematchMap.find(out_match);
if(find_iter != prematchMap.end())
{
// We have processed this index already
if(find_iter->second)
break;
else
find_iter->second = true;
}
if(b == '$')
{
// We've found the lexicographic index for this read. Turn it into a proper ID
out_match.index = indices.pSSA->lookupLexoRank(out_match.index);
matches.insert(out_match);
break;
}
}
}
// Refine the matches by computing proper overlaps between the sequences
// Use the overlaps that meet the thresholds to build a multiple alignment
for(KmerMatchSet::iterator iter = matches.begin(); iter != matches.end(); ++iter)
{
std::string match_sequence = BWTAlgorithms::extractString(indices.pBWT, iter->index);
if(iter->is_reverse)
match_sequence = reverseComplement(match_sequence);
// Ignore identical matches
if(match_sequence == query)
continue;
// Compute the overlap. If the kmer match occurs a single time in each sequence we use
// the banded extension overlap strategy. Otherwise we use the slow O(M*N) overlapper.
//.........这里部分代码省略.........
示例7: alignHaplotypeToReferenceKmer
// Align the haplotype to the reference genome represented by the BWT/SSA pair
void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
const std::string& haplotype,
const BWTIndexSet& referenceIndex,
const ReadTable* pReferenceTable,
HapgenAlignmentVector& outAlignments)
{
PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer")
int64_t max_interval_size = 4;
if(haplotype.size() < k)
return;
std::vector<int> event_count_vector;
std::vector<HapgenAlignment> tmp_alignments;
int min_events = std::numeric_limits<int>::max();
// Align forward and reverse haplotype to reference
for(size_t i = 0; i <= 1; ++i)
{
bool is_reverse = i == 1;
std::string query = is_reverse ? reverseComplement(haplotype) : haplotype;
// Find shared kmers between the haplotype and the reference
CandidateVector candidates;
size_t nqk = query.size() - k + 1;
for(size_t j = 0; j < nqk; ++j)
{
std::string kmer = query.substr(j, k);
// Find the interval of this kmer in the reference
BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer);
if(!interval.isValid() || interval.size() >= max_interval_size)
continue; // not found or too repetitive
// Extract the reference location of these hits
for(int64_t k = interval.lower; k <= interval.upper; ++k)
{
SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT);
// Make a candidate alignment
CandidateKmerAlignment candidate;
candidate.query_index = j;
candidate.target_index = elem.getPos();
candidate.target_extrapolated_start = candidate.target_index - candidate.query_index;
candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size();
candidate.target_sequence_id = elem.getID();
candidates.push_back(candidate);
}
}
// Remove duplicate candidates
std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart);
CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart);
candidates.resize(new_end - candidates.begin());
for(size_t j = 0; j < candidates.size(); ++j)
{
// Extract window around reference
size_t window_size = 200;
int ref_start = candidates[j].target_extrapolated_start - window_size;
int ref_end = candidates[j].target_extrapolated_end + window_size;
const SeqItem& ref_record = pReferenceTable->getRead(candidates[j].target_sequence_id);
const DNAString& ref_sequence = ref_record.seq;
if(ref_start < 0)
ref_start = 0;
if(ref_end > (int)ref_sequence.length())
ref_end = ref_sequence.length();
std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start);
// Align haplotype to the reference
SequenceOverlap overlap = alignHaplotypeToReference(ref_substring, query);
if(overlap.score < 0 || !overlap.isValid())
continue;
int alignment_start = ref_start + overlap.match[0].start;
int alignment_end = ref_start + overlap.match[0].end; // inclusive
int alignment_length = alignment_end - alignment_start + 1;
// Crude count of the number of distinct variation events
bool has_indel = false;
int num_events = overlap.edit_distance;
std::stringstream c_parser(overlap.cigar);
int len;
char t;
while(c_parser >> len >> t)
{
assert(len > 0);
// Only count one event per insertion/deletion
if(t == 'D' || t == 'I')
{
num_events -= (len - 1);
has_indel = true;
}
}
//.........这里部分代码省略.........
示例8: alignHaplotypeToReferenceKmer
// Align the haplotype to the reference genome represented by the BWT/SSA pair
void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
const std::string& haplotype,
const BWTIndexSet& referenceIndex,
const ReadTable* pReferenceTable,
HapgenAlignmentVector& outAlignments)
{
PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer")
int64_t max_interval_size = 4;
if(haplotype.size() < k)
return;
std::vector<int> event_count_vector;
std::vector<HapgenAlignment> tmp_alignments;
int min_events = std::numeric_limits<int>::max();
// Align forward and reverse haplotype to reference
for(size_t i = 0; i <= 1; ++i)
{
bool is_reverse = i == 1;
std::string query = is_reverse ? reverseComplement(haplotype) : haplotype;
// Find shared kmers between the haplotype and the reference
CandidateVector candidates;
size_t nqk = query.size() - k + 1;
for(size_t j = 0; j < nqk; ++j)
{
std::string kmer = query.substr(j, k);
// Find the interval of this kmer in the reference
BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer);
if(!interval.isValid() || interval.size() >= max_interval_size)
continue; // not found or too repetitive
// Extract the reference location of these hits
for(int64_t k = interval.lower; k <= interval.upper; ++k)
{
SAElem elem = referenceIndex.pSSA->calcSA(k, referenceIndex.pBWT);
// Make a candidate alignment
CandidateKmerAlignment candidate;
candidate.query_index = j;
candidate.target_index = elem.getPos();
candidate.target_extrapolated_start = candidate.target_index - candidate.query_index;
candidate.target_extrapolated_end = candidate.target_extrapolated_start + query.size();
candidate.target_sequence_id = elem.getID();
candidates.push_back(candidate);
}
}
// Remove duplicate candidates
std::sort(candidates.begin(), candidates.end(), CandidateKmerAlignment::sortByStart);
CandidateVector::iterator new_end = std::unique(candidates.begin(), candidates.end(), CandidateKmerAlignment::equalByStart);
candidates.resize(new_end - candidates.begin());
for(size_t j = 0; j < candidates.size(); ++j)
{
// Extract window around reference
size_t window_size = 200;
int ref_start = candidates[j].target_extrapolated_start - window_size;
int ref_end = candidates[j].target_extrapolated_end + window_size;
const DNAString& ref_sequence = pReferenceTable->getRead(candidates[j].target_sequence_id).seq;
if(ref_start < 0)
ref_start = 0;
if(ref_end > (int)ref_sequence.length())
ref_end = ref_sequence.length();
std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start);
// Align haplotype to the reference
SequenceOverlap overlap = Overlapper::computeOverlap(query, ref_substring);
// Skip terrible alignments
double percent_aligned = (double)overlap.getOverlapLength() / query.size();
if(percent_aligned < 0.95f)
continue;
/*
// Skip alignments that are not full-length matches of the haplotype
if(overlap.match[0].start != 0 || overlap.match[0].end != (int)haplotype.size() - 1)
continue;
*/
int alignment_start = ref_start + overlap.match[1].start;
int alignment_end = ref_start + overlap.match[1].end; // inclusive
int alignment_length = alignment_end - alignment_start + 1;
// Crude count of the number of distinct variation events
int num_events = overlap.edit_distance;
std::stringstream c_parser(overlap.cigar);
int len;
char t;
while(c_parser >> len >> t)
{
assert(len > 0);
// Only count one event per insertion/deletion
if(t == 'D' || t == 'I')
//.........这里部分代码省略.........
示例9: calculateDeBruijnExtensionsSingleIndex
AlphaCount64 BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(const std::string str,
const BWT* pBWT,
EdgeDir direction,
const BWTIntervalCache* pFwdCache)
{
size_t k = str.size();
size_t p = k - 1;
std::string pmer;
// In the sense direction, we extend from the 3' end
if(direction == ED_SENSE)
pmer = str.substr(1, p);
else
pmer = str.substr(0, p);
assert(pmer.length() == p);
std::string rc_pmer = reverseComplement(pmer);
// As we only have a single index, we can only directly look up
// the extensions for either the pmer or its reverse complement
// In the SENSE extension direction, we directly look up for
// the reverse complement. In ANTISENSE we directly look up for
// the pmer.
// Get the extension bases
AlphaCount64 extensions;
AlphaCount64 rc_extensions;
// Set up pointers to the data to fill in/query
// depending on the direction of the extension
AlphaCount64* pDirectEC;
AlphaCount64* pIndirectEC;
std::string* pDirectStr;
std::string* pIndirectStr;
if(direction == ED_SENSE)
{
pDirectEC = &rc_extensions;
pDirectStr = &rc_pmer;
pIndirectEC = &extensions;
pIndirectStr = &pmer;
}
else
{
pDirectEC = &extensions;
pDirectStr = &pmer;
pIndirectEC = &rc_extensions;
pIndirectStr = &rc_pmer;
}
// Get the interval for the direct query string
BWTInterval interval;
// Use interval cache if available
if(pFwdCache)
interval = BWTAlgorithms::findIntervalWithCache(pBWT, pFwdCache, *pDirectStr);
else
interval = BWTAlgorithms::findInterval(pBWT, *pDirectStr);
// Fill in the direct count
if(interval.isValid())
*pDirectEC = BWTAlgorithms::getExtCount(interval, pBWT);
// Now, for the non-direct index, query the 4 possible k-mers that are adjacent to the pmer
// Setup the query sequence
std::string query(k, 'A');
int varIdx = query.size() - 1;
query.replace(0, p, *pIndirectStr);
for(int i = 0; i < BWT_ALPHABET::size; ++i)
{
// Transform the query
char b = BWT_ALPHABET::getChar(i);
query[varIdx] = b;
// Perform lookup
if(pFwdCache)
interval = BWTAlgorithms::findIntervalWithCache(pBWT, pFwdCache, query);
else
interval = BWTAlgorithms::findInterval(pBWT, query);
// Update the extension count
if(interval.isValid())
pIndirectEC->add(b, interval.size());
}
// Switch the reverse-complement extensions to the same strand as the str
rc_extensions.complement();
extensions += rc_extensions;
return extensions;
}
示例10: countSequenceOccurrencesSingleStrand
size_t BWTAlgorithms::countSequenceOccurrencesSingleStrand(const std::string& w, const BWT* pBWT)
{
BWTInterval interval = findInterval(pBWT, w);
return interval.isValid() ? interval.size() : 0;
}
示例11: process
ClusterResult ClusterProcess::process(const SequenceWorkItem& item)
{
// Calculate the intervals in the forward FM-index for this read
const BWT* pBWT = m_pOverlapper->getBWT();
// Check if this read is a substring
OverlapBlockList tempBlockList;
OverlapResult overlapResult = m_pOverlapper->alignReadDuplicate(item.read, &tempBlockList);
if(overlapResult.isSubstring)
{
std::cerr << "Error: substring reads found in sga-cluster. Please run rmdup before cluster\n";
exit(1);
}
// Find the interval in the fm-index containing the read
std::string readString = item.read.seq.toString();
BWTInterval readInterval = BWTAlgorithms::findInterval(pBWT, readString);
BWTAlgorithms::updateInterval(readInterval, '$', pBWT);
// The read must be present in the index
assert(readInterval.isValid());
// Check if this read has been used yet
bool used = false;
for(int64_t i = readInterval.lower; i <= readInterval.upper; ++i)
{
if(m_pMarkedReads->test(i))
{
used = true;
break;
}
}
ClusterResult result;
if(used)
return result; // already part of a cluster, return nothing
// Compute a new cluster around this read
std::set<int64_t> usedIndex;
ClusterNodeQueue queue;
ClusterNode node;
node.sequence = item.read.seq.toString();
node.interval = readInterval;
node.isReverseInterval = false;
usedIndex.insert(readInterval.lower);
queue.push(node);
while(!queue.empty())
{
ClusterNode node = queue.front();
queue.pop();
// Update the used index and the result structure with this node's data
result.clusterNodes.push_back(node);
SeqRecord tempRecord;
tempRecord.id = "cluster";
tempRecord.seq = node.sequence;
OverlapBlockList blockList;
OverlapResult result = m_pOverlapper->overlapRead(tempRecord, m_minOverlap, &blockList);
//m_pOverlapper->buildForwardHistory(&blockList);
// Parse each member of the block list and potentially expand the cluster
for(OverlapBlockList::const_iterator iter = blockList.begin(); iter != blockList.end(); ++iter)
{
// Check if the reads in this block are part of the cluster already
BWTInterval canonicalInterval = iter->getCanonicalInterval();
int64_t canonicalIndex = canonicalInterval.lower;
if(usedIndex.count(canonicalIndex) == 0)
{
usedIndex.insert(canonicalIndex);
ClusterNode newNode;
newNode.sequence = iter->getFullString(node.sequence);
newNode.interval = canonicalInterval;
newNode.isReverseInterval = iter->flags.isTargetRev();
queue.push(newNode);
}
}
}
// If some work was performed, update the bitvector so other threads do not try to merge the same set of reads.
// This uses compare-and-swap instructions to ensure the uppdate is atomic.
// If some other thread has merged this set (and updated
// the bitvector), we discard all the merged data.
// As a given set of reads should all be merged together, we only need to make sure we atomically update
// the bit for the read with the lowest index in the set.
// Sort the intervals into ascending order and remove any duplicate intervals (which can occur
// if the subgraph has a simple cycle)
std::sort(result.clusterNodes.begin(), result.clusterNodes.end(), ClusterNode::compare);
std::vector<ClusterNode>::iterator newEnd = std::unique(result.clusterNodes.begin(),
result.clusterNodes.end(),
ClusterNode::equal);
size_t oldSize = result.clusterNodes.size();
result.clusterNodes.erase(newEnd, result.clusterNodes.end());
size_t newSize = result.clusterNodes.size();
if(oldSize != newSize)
std::cout << "Warning: duplicate cluster nodes were found\n";
//.........这里部分代码省略.........
示例12: process
MetAssembleResult MetAssemble::process(const SequenceWorkItem& item)
{
MetAssembleResult result;
SeqRecord currRead = item.read;
std::string w = item.read.seq.toString();
if(w.size() <= m_parameters.kmer)
{
return result;
}
// Perform a backwards search using the read sequence
// Check which k-mers have already been visited using the
// shared bitvector. If any bit in the range [l,u] is set
// for a suffix of the read, then we do not visit those kmers
// later
int len = w.size();
int num_kmers = len - m_parameters.kmer + 1;
std::vector<bool> visitedKmers(false, num_kmers);
int j = len - 1;
char curr = w[j];
BWTInterval interval;
BWTAlgorithms::initInterval(interval, curr, m_parameters.pBWT);
--j;
for(;j >= 0; --j)
{
curr = w[j];
BWTAlgorithms::updateInterval(interval, curr, m_parameters.pBWT);
assert(interval.isValid());
// At this point interval represents the suffix [j,len)
// Check if the starting point of this interval is set
if(j < num_kmers)
visitedKmers[j] = m_parameters.pBitVector->test(interval.lower);
}
// Process the kmers that have not been previously visited
for(j = 0; j < num_kmers; ++j)
{
if(visitedKmers[j])
continue; // skip
std::string kmer = w.substr(j, m_parameters.kmer);
// Get the interval for this kmer
BWTInterval interval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, kmer);
// Check if this interval has been marked by a previous iteration of the loop
assert(interval.isValid());
if(m_parameters.pBitVector->test(interval.lower))
continue;
BWTInterval rc_interval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, reverseComplement(kmer));
size_t count = interval.size();
if(rc_interval.isValid())
count += rc_interval.size();
if(count >= m_parameters.kmerThreshold)
{
// Process the kmer
std::string contig = processKmer(kmer, count);
// We must determine if this contig has been assembled by another thread.
// Break the contig into lexicographically ordered set of kmers. The lowest kmer is chosen
// to represent the contig. If this kmer has been marked as visited, we discard the contig
// otherwise we mark all kmers in the contig and output the contig.
StringVector kmers = getLexicographicKmers(contig);
// Get the lowest kmer in the set
assert(!kmers.empty());
std::string lowest = kmers.front();
BWTInterval lowInterval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, lowest);
BWTInterval lowRCInterval = BWTAlgorithms::findIntervalWithCache(m_parameters.pBWT, m_parameters.pBWTCache, reverseComplement(lowest));
bool marked = false;
// If the kmer exists in the read set (the interval is valid), we attempt to mark it
// Otherwise, we attempt to mark its reverse complement. If either call succeeds
// we output the contig. This block of code gives the synchronization between threads.
// If multiple threads attempt to assemble the same contig, marked will be true for only
// one of the threads.
if(lowInterval.isValid())
marked = m_parameters.pBitVector->updateCAS(lowInterval.lower, false, true);
else
marked = m_parameters.pBitVector->updateCAS(lowRCInterval.lower, false, true);
// Mark all the kmers in the contig so they will not be visited again
markSequenceKmers(contig);
// If the collision check passed, output the contig
if(marked)
{
if(contig.size() >= m_parameters.minLength)
result.contigs.push_back(contig);
}
}
// Update the bit vector for the source kmer
for(int64_t i = interval.lower; i <= interval.upper; ++i)
//.........这里部分代码省略.........
示例13: PacBioRetrieveMatches
SequenceOverlapPairVector KmerOverlaps::PacBioRetrieveMatches(const std::string& query, size_t k,
int min_overlap, double min_identity,
int bandwidth, const BWTIndexSet& indices,
KmerDistribution& kd, int round)
{
PROFILE_FUNC("OverlapHaplotypeBuilder::PacBioRetrieveMatches")
assert(indices.pBWT != NULL);
assert(indices.pSSA != NULL);
//size_t numStringCount[query.size()+1] = 0;
int64_t intervalSum = 0;
static size_t n_calls = 0;
static size_t n_candidates = 0;
static size_t n_output = 0;
static double t_time = 0;
size_t count = 0;
size_t numKmer = 0;
size_t numRepeatKmer = 0;
size_t totalKmer = 0;
size_t numNoSeedRead = 0;
size_t repeatCutoff = kd.getRepeatKmerCutoff();
size_t errorCutoff = kd.getMedian() - kd.getSdv();
Timer timer("test", true);
n_calls++;
//std::cout<<"PacBioRetrieveMatches\n";
std::cout<<"\tk :\t"<<k<<"\n";
SequenceOverlapPairVector overlap_vector;
std::vector<long> identityVector(100);
for(int j = 0;j < identityVector.size(); j++)
identityVector[j] = 0;
// Use the FM-index to look up intervals for each kmer of the read. Each index
// in the interval is stored individually in the KmerMatchMap. We then
// backtrack to map these kmer indices to read IDs. As reads can share
// multiple kmers, we use the map to avoid redundant lookups.
// There is likely a faster algorithm which performs direct decompression
// of the read sequences without having to expand the intervals to individual
// indices. The current algorithm suffices for now.
KmerMatchMap prematchMap;
size_t num_kmers = query.size() - k + 1;
clock_t search_seeds_s = clock(), search_seeds_e;
for(size_t i = 0; i < num_kmers; i++)
{
std::string kmer = query.substr(i, k);
BWTInterval interval = BWTAlgorithms::findInterval(indices, kmer);
if(interval.upper - interval.lower < errorCutoff)
numNoSeedRead++;
if((interval.upper - interval.lower) > 20 && (interval.upper - interval.lower) < repeatCutoff)
{
numKmer++;
totalKmer++;
}
//To avoid the repeat region
/*if((interval.upper - interval.lower) > repeatCutoff)
{
numRepeatKmer++;
totalKmer++;
continue;
}
else
interval.upper = ((interval.upper - interval.lower)>20)?interval.lower + 20 : interval.upper;*/
if(interval.isValid() && interval.size())
{
//std::cout<<"\tinterval size : "<<interval.upper - interval.lower<<std::endl;
for(int64_t j = interval.lower; j <= interval.upper; ++j)
{
KmerMatch match = { i, static_cast<size_t>(j), false };
prematchMap.insert(std::make_pair(match, false));
}
intervalSum += interval.upper - interval.lower;
count++;
}
kmer = reverseComplement(kmer);
interval = BWTAlgorithms::findInterval(indices, kmer);
interval.upper = ((interval.upper - interval.lower)>20)?interval.lower + 20 : interval.upper;
if(interval.isValid() && interval.size())
{
for(int64_t j = interval.lower; j <= interval.upper; ++j)
{
KmerMatch match = { i, static_cast<size_t>(j), true };
prematchMap.insert(std::make_pair(match, false));
}
intervalSum += interval.upper - interval.lower;
count++;
}
}
if(numNoSeedRead == num_kmers)
std::cout<<"\tnoSeedRead : 1"<<std::endl;
std::cout<<"\tnumber of kmer : "<<numKmer<<std::endl;
std::cout<<"\tnumber of RepeatKmer : "<<numRepeatKmer<<std::endl;
std::cout<<"\tnumber of totalkmer : "<<totalKmer<<std::endl;
//.........这里部分代码省略.........