This article collects typical usage examples of the C++ Vocab class drawn from real projects. If you are wondering what the Vocab class is for, or how to use it, the curated examples below should help.
Fifteen code examples of the Vocab class are shown below, sorted by popularity.
Example 1: PrintSentence
void ParallelCorpus::PrintSentence(
    const Sentence& sentence, const Vocab& vocab, std::ostream& out) const {
  // Print the first word bare, then prefix each later word with a space,
  // so the line carries no leading or trailing separator.
  if (sentence.size() > 0) {
    out << vocab.GetWord(sentence.at(0));
  }
  for (int i = 1; i < sentence.size(); ++i) {
    out << " " << vocab.GetWord(sentence.at(i));
  }
}
Example 2: Print
void PackedTrie::Print(const Vocab& source_vocab, const Vocab& target_vocab,
                       std::ostream& out) const {
  for (int s = 0; s < source_count_; ++s) {
    for (int i = offsets_[s]; i < offsets_[s + 1]; ++i) {
      out << source_vocab.GetWord(s) << "\t"
          << target_vocab.GetWord(target_words_[i]) << "\t"
          << exp(data_[i]) << std::endl;
    }
  }
}
Example 3: getBigramProb
// Get P(w2 | w1) -- bigram probability from an SRILM language model.
double getBigramProb(const char *w1, const char *w2, Vocab &voc, Ngram &lm) {
  VocabIndex wid1 = voc.getIndex(w1);
  VocabIndex wid2 = voc.getIndex(w2);
  if (wid1 == Vocab_None)  // OOV history word: back off to the unknown word
    wid1 = voc.getIndex(Vocab_Unknown);
  if (wid2 == Vocab_None)  // OOV predicted word: return a fixed log-prob penalty
    return -20;
  VocabIndex context[] = { wid1, Vocab_None };
  return lm.wordProb(wid2, context);
}
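The Vocab, Ngram, VocabIndex, Vocab_None, and Vocab_Unknown identifiers above come from the SRILM toolkit. A minimal sketch of how the helper might be driven, assuming the SRILM headers and an ARPA-format model file (the file name bigram.lm and the query words are illustrative):

#include "Vocab.h"
#include "Ngram.h"
#include "File.h"

int main() {
  Vocab voc;
  Ngram lm(voc, 2);               // order-2 (bigram) model
  File lmFile("bigram.lm", "r");  // ARPA-format LM; the name is illustrative
  lm.read(lmFile);
  double logProb = getBigramProb("the", "cat", voc, lm);
  return 0;
}

Note that Ngram::wordProb returns a base-10 log probability, so the value is at most 0 for in-vocabulary pairs.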
Example 4: CreateFromString
void Word::CreateFromString(const std::string &inString, Vocab &vocab)
{
  if (inString.substr(0, 1) == "[" && inString.substr(inString.size() - 1, 1) == "]") {
    // non-terminal: strip the surrounding brackets before the vocab lookup
    m_isNonTerminal = true;
    string str = inString.substr(1, inString.size() - 2);
    m_vocabId = vocab.AddVocabId(str);
  } else {
    m_isNonTerminal = false;
    m_vocabId = vocab.AddVocabId(inString);
  }
}
Example 5: CreateVocabMap
void CorpusReader::CreateVocabMap(const Vocab& corpus_vocab,
                                  const vector< vector<string> >& filter_vocab,
                                  vector<IntIntMap>* lookup) {
  assert(corpus_vocab.has_language());
  int lang = corpus_vocab.language();
  if (lang >= (int)lookup->size()) lookup->resize(lang + 1);
  if (filter_vocab[lang].size() > 0) {
    cout << "Adding vocab for language " << lang << " ("
         << corpus_vocab.terms().size() << ")" << endl;
    CreateFilteredMap(corpus_vocab, filter_vocab[lang], &(*lookup)[lang]);
  } else {
    cout << "Skipping language " << lang << endl;
  }
}
Example 6: CreateUnfilteredMap
void CorpusReader::CreateUnfilteredMap(const Vocab& proto_voc,
                                       StringIntMap* lookup,
                                       IntIntMap* mapping) {
  for (int ii = 0; ii < proto_voc.terms_size(); ++ii) {
    const lib_corpora_proto::Vocab_Entry& word = proto_voc.terms(ii);
    string term = word.original();
    // Assign the next free id to any term not seen before.
    if (lookup->find(term) == lookup->end()) {
      int new_id = lookup->size();
      (*lookup)[term] = new_id;
    }
    (*mapping)[word.id()] = (*lookup)[term];
  }
}
Example 7: ConvertToMoses
void Word::ConvertToMoses(
    const std::vector<Moses::FactorType> &outputFactorsVec,
    const Vocab &vocab,
    Moses::Word &overwrite) const {
  Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
  overwrite = Moses::Word(m_isNonTerminal);

  // TODO: this conversion should have been done at load time.
  util::TokenIter<util::SingleCharacter> tok(vocab.GetString(m_vocabId), '|');
  for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin();
       t != outputFactorsVec.end(); ++t, ++tok) {
    UTIL_THROW_IF(!tok, util::Exception, "Too few factors in \""
                  << vocab.GetString(m_vocabId) << "\"; was expecting "
                  << outputFactorsVec.size());
    overwrite.SetFactor(*t, factorColl.AddFactor(*tok));
  }
  UTIL_THROW_IF(tok, util::Exception, "Too many factors in \""
                << vocab.GetString(m_vocabId) << "\"; was expecting "
                << outputFactorsVec.size());
}
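A design note: Moses represents a factored word (for example, surface form plus POS tag) as a single vocabulary string with the factors joined by '|', so a stored entry such as house|NN is split here and each piece is assigned to the corresponding factor index in outputFactorsVec; the two UTIL_THROW_IF checks enforce that the stored string and outputFactorsVec agree on the number of factors.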
Example 8: LoadVocabulary
void SparseHieroReorderingFeature::LoadVocabulary(const std::string& filename, Vocab& vocab)
{
  if (filename.empty()) return;
  ifstream in(filename.c_str());
  UTIL_THROW_IF(!in, util::Exception, "Unable to open vocab file: " << filename);
  string line;
  while (getline(in, line)) {
    vocab.insert(FactorCollection::Instance().AddFactor(line));
  }
  in.close();
}
Example 9: CreateFilteredMap
void CorpusReader::CreateFilteredMap(const Vocab& corpus_voc,
                                     const vector<string>& filter_voc,
                                     IntIntMap* id_lookup) {
  // RHS will be the new vocab
  map<string, int> new_id;
  for (int ii = 0; ii < (int)filter_voc.size(); ++ii) {
    new_id[filter_voc[ii]] = ii;
  }
  // LHS will be the old vocab
  for (int ii = 0; ii < corpus_voc.terms_size(); ++ii) {
    const lib_corpora_proto::Vocab_Entry& word = corpus_voc.terms(ii);
    string term = word.original();
    if (new_id.find(term) != new_id.end()) {
      (*id_lookup)[word.id()] = new_id[term];
    }
  }
}
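As a concrete illustration: with filter_voc = {"cat", "dog"}, the new ids become cat=0 and dog=1; a corpus term that also appears in the filter list gets its old id mapped to the new one, while terms absent from the filter list are simply never entered into id_lookup, which is what performs the filtering.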
Example 10: extractBinaryfromStream
void extractBinaryfromStream(const char * inputStream, Vocab & textHash,
                             vector< tuple<int *, int> > & src_batch,
                             vector< tuple<int *, int> > & tgt_batch,
                             int isFilter, int debugLines)
{
  ifstream infile;
  infile.open(inputStream, ifstream::in);
  string line;
  int lineIdx = 0;
  while (getline(infile, line))
  {
    // Each line holds a tab-separated source/target sentence pair.
    stringstream linestream(line);
    string src, tgt;
    getline(linestream, src, '\t');
    getline(linestream, tgt, '\t');

    int src_token_num = 0;
    int tgt_token_num = 0;
    char** src_tokens = BasicUtil::TokenizeString(src, src_token_num, MAX_TOKEN_NUM, MAX_TOKEN_LEN);
    char** tgt_tokens = BasicUtil::TokenizeString(tgt, tgt_token_num, MAX_TOKEN_NUM, MAX_TOKEN_LEN);

    int * src_fea = new int[MAX_TOKEN_LEN * MAX_TOKEN_NUM];
    int * src_seg = new int[MAX_TOKEN_NUM];
    int * tgt_fea = new int[MAX_TOKEN_LEN * MAX_TOKEN_NUM];
    int * tgt_seg = new int[MAX_TOKEN_NUM];
    int src_seg_num = textHash.FeatureExtract((const char **)src_tokens, src_token_num, src_seg, src_fea);
    int tgt_seg_num = textHash.FeatureExtract((const char **)tgt_tokens, tgt_token_num, tgt_seg, tgt_fea);

    // The last segment offset is the total feature count.
    int src_feature_num = (src_seg_num >= 1) ? src_seg[src_seg_num - 1] : 0;
    int tgt_feature_num = (tgt_seg_num >= 1) ? tgt_seg[tgt_seg_num - 1] : 0;

    if (isFilter == 1 && (src_feature_num <= 0 || tgt_feature_num <= 0)) {
      // Skip empty pairs; free the buffers that will not be batched
      // (the original leaked them here).
      delete[] src_fea;
      delete[] tgt_fea;
      delete[] src_seg;
      delete[] tgt_seg;
      continue;
    }
    // The batches take ownership of the feature arrays; the segment
    // buffers are no longer needed.
    src_batch.push_back(tuple<int*, int>(src_fea, src_feature_num));
    tgt_batch.push_back(tuple<int*, int>(tgt_fea, tgt_feature_num));
    delete[] src_seg;
    delete[] tgt_seg;

    lineIdx += 1;
    if (lineIdx == debugLines) break;
  }
}
Example 11: CreateFromString
void Word::CreateFromString(const std::string &inString, Vocab &vocab)
{
  if (inString.substr(0, 1) == "[" && inString.substr(inString.size() - 1, 1) == "]") {
    // non-term
    m_isNonTerminal = true;
  }
  else {
    m_isNonTerminal = false;
  }
  m_factors.resize(1);
  m_factors[0] = vocab.AddVocabId(inString);
}
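Note the contrast with Example 4: this variant stores a one-element factor array rather than a single m_vocabId, and it adds the unmodified inString to the vocabulary, so a non-terminal keeps its surrounding brackets in the id lookup instead of having them stripped.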
Example 12: convert_trees_to_indexed_minibatches
treebank_minibatch_dataset convert_trees_to_indexed_minibatches(
    const Vocab& word_vocab,
    const std::vector<AnnotatedParseTree::shared_tree>& trees,
    int minibatch_size) {
  treebank_minibatch_dataset dataset;
  auto to_index_pair = [&word_vocab](std::pair<std::vector<std::string>, uint>&& pair,
                                     bool&& is_root) {
    return std::tuple<std::vector<uint>, uint, bool>(
        word_vocab.encode(pair.first),
        pair.second,
        is_root);
  };
  if (dataset.size() == 0)
    dataset.emplace_back(0);
  for (auto& tree : trees) {
    // start a new minibatch once the current one is full
    if (dataset.back().size() == minibatch_size) {
      dataset.emplace_back(0);
      dataset.back().reserve(minibatch_size);
    }
    // add the root
    dataset.back().emplace_back(
        to_index_pair(tree->to_labeled_pair(), true));
    // add the children
    for (auto& child : tree->general_children) {
      if (dataset.back().size() == minibatch_size) {
        dataset.emplace_back(0);
        dataset.back().reserve(minibatch_size);
      }
      dataset.back().emplace_back(
          to_index_pair(child->to_labeled_pair(), false));
    }
  }
  return dataset;
}
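The packing is greedy: a fresh minibatch is opened whenever the current one reaches minibatch_size, and every entry is a (token ids, label, is_root) tuple whose boolean distinguishes the root of each tree from its descendants, so downstream code can tell the two apart.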
Example 13: add_example
void add_example(
    const Vocab& vocab,
    const vector<string>& example_orig,
    size_t& example_idx) {
  // Truncate the example to the configured maximum sentence length.
  int len = std::min(example_orig.size(), (size_t)FLAGS_max_sentence_length);
  vector<string> example(example_orig.begin(), example_orig.begin() + len);
  auto description_length = example.size();
  this->data.w(0, example_idx) = vocab.word2index.at(START);
  auto encoded = vocab.encode(example, true);
  this->mask.w(0, example_idx) = 0.0;
  for (size_t j = 0; j < encoded.size(); j++) {
    this->data.w(j + 1, example_idx) = encoded[j];
    this->mask.w(j + 1, example_idx) = (R)1.0;
  }
  this->code_lengths[example_idx] = description_length + 1;
  this->total_codes += description_length + 1;
}
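Layout note: each example occupies one column of data; row 0 always holds the START token, whose mask entry is set to 0.0 (presumably so the start symbol itself is not scored), rows 1 through n hold the encoded words with mask 1.0, and code_lengths counts the START token too, hence description_length + 1.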
Example 14: AddLine
void ReferenceSet::AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab)
{
  NgramCounter ngramCounts;
  list<WordVec> openNgrams;
  size_t length = 0;
  // tokenize & count
  for (util::TokenIter<util::SingleCharacter, true> j(line, util::SingleCharacter(' ')); j; ++j) {
    const Vocab::Entry* nextTok = &(vocab.FindOrAdd(*j));
    ++length;
    openNgrams.push_front(WordVec());
    for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end(); ++k) {
      k->push_back(nextTok);
      ++ngramCounts[*k];
    }
    if (openNgrams.size() >= kBleuNgramOrder) openNgrams.pop_back();
  }
  // merge into the overall ngram map
  for (NgramCounter::const_iterator ni = ngramCounts.begin();
       ni != ngramCounts.end(); ++ni) {
    size_t count = ni->second;
    if (ngramCounts_.size() <= sentenceId) ngramCounts_.resize(sentenceId + 1);
    NgramMap::iterator totalsIter = ngramCounts_[sentenceId].find(ni->first);
    if (totalsIter == ngramCounts_[sentenceId].end()) {
      ngramCounts_[sentenceId][ni->first] = pair<size_t, size_t>(count, count);
    } else {
      ngramCounts_[sentenceId][ni->first].first =
          max(count, ngramCounts_[sentenceId][ni->first].first);   // clip
      ngramCounts_[sentenceId][ni->first].second += count;         // no clip
    }
  }
  // reference length
  if (lengths_.size() <= sentenceId) lengths_.resize(sentenceId + 1);
  // TODO - length strategy - this is MIN
  if (!lengths_[sentenceId]) {
    lengths_[sentenceId] = length;
  } else {
    lengths_[sentenceId] = min(length, lengths_[sentenceId]);
  }
}
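The pair stored per n-gram carries the two counts BLEU needs: .first keeps the clipped count, the maximum number of times the n-gram occurs in any single reference line for this sentence, while .second accumulates the unclipped total; the per-sentence reference length is the minimum across reference lines, as the TODO about the length strategy notes.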
Example 15: forwardish
// Returns a vector of LiveGuessResults
// warning: words is mutated temporarily
std::auto_ptr< std::vector<LiveGuessResult> >
forwardish(std::vector<const char *> & words,  // the current words; can be empty
           const double currentProb,           // log prob so far
           const int size,                     // how many candidates to grab
           const int depthLeft,
           const NgramLM & _lm,
           const int _order,
           const Vocab & vocab) {
  // vwords[0 .. _order-2] hold the history; the last slot is filled in with
  // each candidate word below. If the history is too short, pad with
  // Vocab::Invalid (probably should be end-of-sentence).
  VocabIndex vwords[_order];
  for (int i = 1; i < _order; i++) {
    int j = words.size() - _order + i;
    if (j < 0) {
      vwords[i - 1] = Vocab::Invalid;
    } else {
      vwords[i - 1] = vocab.Find(words[j]);
    }
  }

  vector<VocabProb> heap(0);
  mkHeap(heap);
  const ProbVector & probabilities = _lm.probs(_order);
  const CountVector & counts = _lm.counts(_order);  // unused here; kept from the original
  int count = 0;
  for (int j = 0; j < vocab.size(); j++) {
    vwords[_order - 1] = j;
    NgramIndex newIndex = _lm.model()._Find(vwords, _order);
    if (newIndex == -1) {  // the ngram does not exist in the model
      continue;
    }
    Prob probRaw = probabilities[newIndex];
    if (probRaw == 0.0) {
      continue;
    }
    // Negated log prob: the best (highest-probability) word gets the
    // smallest key, so the heap's front is always the worst candidate kept.
    Prob prob = -1 * log(probRaw);
    const VocabProb v(prob, j, newIndex);
    if (count < size) {
      heap.push_back(v);
      count++;
      if (count == size) {
        mkHeap(heap);
      }
    } else if (heap.front().prob > prob) {
      // The heap is full: kick out the worst candidate and keep this one.
      popHeap(heap);
      pushHeap(heap, v);
    }
  }
  sortHeap(heap);

  std::vector<LiveGuessResult> * resVector = new std::vector<LiveGuessResult>();
  for (int j = 0; j < heap.size(); j++) {
    VocabProb v = heap[j];
    Prob prob = v.prob;
    prob += currentProb;
    const char * word = vocab[v.index];
    vector<const char *> ourWords(words);
    ourWords.push_back(word);
    char * str = joinVectorOfCStrings(ourWords);  // remember to deallocate later :(
//......... remainder of this example omitted .........