This article collects typical usage examples of the C++ method tokenizer::set_text. If you are unsure what tokenizer::set_text does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also look further into other usage examples of the enclosing class, tokenizer.
Seven code examples of tokenizer::set_text are shown below, ordered roughly by popularity.
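All seven examples follow the same driving pattern: hand a paragraph to the tokenizer with set_text, then pull out one sentence of tokens per call to next_sentence. The sketch below shows that pattern in isolation. It assumes the MorphoDiTa C++ API; the morphodita.h header, the tokenizer::new_english_tokenizer() factory, and the main() scaffolding are assumptions added for illustration, not taken from the examples that follow.

// Minimal sketch of the set_text / next_sentence loop (assumed MorphoDiTa API).
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include "morphodita.h"

using namespace ufal::morphodita;

int main() {
  // Obtain a tokenizer; loaded morpho/tagger models also expose new_tokenizer().
  std::unique_ptr<tokenizer> t(tokenizer::new_english_tokenizer());

  std::string text = "Hello world. This is a test.";
  std::vector<string_piece> forms;

  // set_text keeps only a pointer to the text by default, so the string
  // must outlive the tokenization loop (or make_copy must be requested).
  t->set_text(text);

  // next_sentence fills `forms` with the tokens of one sentence per call.
  while (t->next_sentence(&forms, nullptr)) {
    for (auto&& form : forms)
      std::cout << std::string(form.str, form.len) << '\n';
    std::cout << '\n';
  }
  return 0;
}

The examples below apply exactly this loop, differing only in what they do with the forms of each sentence: XML tagging, vertical output, or named entity recognition.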
Example 1: tag_xml
void tag_xml(istream& is, ostream& os, const tagger& tagger, tokenizer& tokenizer, const tagset_converter& tagset_converter, const derivation_formatter& derivation, morpho::guesser_mode guesser) {
  string para;
  vector<string_piece> forms;
  vector<tagged_lemma> tags;

  while (getpara(is, para)) {
    // Tokenize and tag
    tokenizer.set_text(para);
    const char* unprinted = para.c_str();
    while (tokenizer.next_sentence(&forms, nullptr)) {
      tagger.tag(forms, tags, guesser);

      for (unsigned i = 0; i < forms.size(); i++) {
        tagset_converter.convert(tags[i]);
        derivation.format_derivation(tags[i].lemma);
        os << xml_encoded(string_piece(unprinted, forms[i].str - unprinted));
        if (!i) os << "<sentence>";
        os << "<token lemma=\"" << xml_encoded(tags[i].lemma, true) << "\" tag=\"" << xml_encoded(tags[i].tag, true) << "\">"
           << xml_encoded(forms[i]) << "</token>";
        if (i + 1 == forms.size()) os << "</sentence>";
        unprinted = forms[i].str + forms[i].len;
      }
    }

    os << xml_encoded(string_piece(unprinted, para.c_str() + para.size() - unprinted)) << flush;
  }
}
Example 2: recognize_vertical
void recognize_vertical(istream& is, ostream& os, const ner& recognizer, tokenizer& tokenizer) {
  string para;
  vector<string_piece> forms;
  vector<named_entity> entities;
  unsigned total_tokens = 0;
  string entity_ids, entity_text;

  while (getpara(is, para)) {
    // Tokenize and recognize named entities
    tokenizer.set_text(para);
    while (tokenizer.next_sentence(&forms, nullptr)) {
      recognizer.recognize(forms, entities);
      sort_entities(entities);

      for (auto&& entity : entities) {
        entity_ids.clear();
        entity_text.clear();
        for (auto i = entity.start; i < entity.start + entity.length; i++) {
          if (i > entity.start) {
            entity_ids += ',';
            entity_text += ' ';
          }
          entity_ids += to_string(total_tokens + i + 1);
          entity_text.append(forms[i].str, forms[i].len);
        }
        os << entity_ids << '\t' << entity.type << '\t' << entity_text << '\n';
      }
      os << flush;

      total_tokens += forms.size() + 1;
    }
  }
}
Example 3: tokenize_vertical
void tokenize_vertical(istream& is, ostream& os, tokenizer& tokenizer) {
  string para;
  vector<string_piece> forms;
  while (getpara(is, para)) {
    // Tokenize
    tokenizer.set_text(para);
    while (tokenizer.next_sentence(&forms, nullptr)) {
      for (auto&& form : forms) {
        os << form << '\n';
      }
      os << '\n' << flush;
    }
  }
}
Example 4: tag_vertical
void tag_vertical(istream& is, ostream& os, const tagger& tagger, tokenizer& tokenizer, const tagset_converter& tagset_converter, const derivation_formatter& derivation, morpho::guesser_mode guesser) {
  string para;
  vector<string_piece> forms;
  vector<tagged_lemma> tags;

  while (getpara(is, para)) {
    // Tokenize and tag
    tokenizer.set_text(para);
    while (tokenizer.next_sentence(&forms, nullptr)) {
      tagger.tag(forms, tags, guesser);

      for (unsigned i = 0; i < tags.size(); i++) {
        tagset_converter.convert(tags[i]);
        derivation.format_derivation(tags[i].lemma);
        os << forms[i] << '\t' << tags[i].lemma << '\t' << tags[i].tag << '\n';
      }
      os << endl;
    }
  }
}
Example 5: tokenize_xml
static void tokenize_xml(istream& is, ostream& os, tokenizer& tokenizer) {
  string para;
  vector<string_piece> forms;
  while (getpara(is, para)) {
    // Tokenize
    tokenizer.set_text(para);
    const char* unprinted = para.c_str();
    while (tokenizer.next_sentence(&forms, nullptr))
      for (unsigned i = 0; i < forms.size(); i++) {
        if (unprinted < forms[i].str) os << xml_encoded(string_piece(unprinted, forms[i].str - unprinted));
        if (!i) os << "<sentence>";
        os << "<token>" << xml_encoded(forms[i]) << "</token>";
        if (i + 1 == forms.size()) os << "</sentence>";
        unprinted = forms[i].str + forms[i].len;
      }

    if (unprinted < para.c_str() + para.size()) os << xml_encoded(string_piece(unprinted, para.c_str() + para.size() - unprinted));
    os << flush;
  }
}
Example 6: recognize_untokenized
void recognize_untokenized(istream& is, ostream& os, const ner& recognizer, tokenizer& tokenizer) {
  string para;
  vector<string_piece> forms;
  vector<named_entity> entities;
  vector<size_t> entity_ends;

  while (getpara(is, para)) {
    // Tokenize the text and find named entities
    tokenizer.set_text(para);
    const char* unprinted = para.c_str();
    while (tokenizer.next_sentence(&forms, nullptr)) {
      recognizer.recognize(forms, entities);
      sort_entities(entities);

      for (unsigned i = 0, e = 0; i < forms.size(); i++) {
        if (unprinted < forms[i].str) os << xml_encoded(string_piece(unprinted, forms[i].str - unprinted));
        if (i == 0) os << "<sentence>";

        // Open entities starting at current token
        for (; e < entities.size() && entities[e].start == i; e++) {
          os << "<ne type=\"" << xml_encoded(entities[e].type, true) << "\">";
          entity_ends.push_back(entities[e].start + entities[e].length - 1);
        }

        // The token itself
        os << "<token>" << xml_encoded(forms[i]) << "</token>";

        // Close entities ending after current token
        while (!entity_ends.empty() && entity_ends.back() == i) {
          os << "</ne>";
          entity_ends.pop_back();
        }

        if (i + 1 == forms.size()) os << "</sentence>";
        unprinted = forms[i].str + forms[i].len;
      }
    }

    // Write rest of the text (should be just spaces)
    if (unprinted < para.c_str() + para.size()) os << xml_encoded(string_piece(unprinted, para.c_str() + para.size() - unprinted));
    os << flush;
  }
}
Example 7: recognize_conll
void recognize_conll(istream& is, ostream& os, const ner& recognizer, tokenizer& tokenizer) {
  string para;
  vector<string_piece> forms;
  vector<named_entity> entities;

  while (getpara(is, para)) {
    // Tokenize and recognize named entities
    tokenizer.set_text(para);
    while (tokenizer.next_sentence(&forms, nullptr)) {
      recognizer.recognize(forms, entities);
      sort_entities(entities);

      string entity_type;
      unsigned in_entity = 0;
      bool entity_start = false;
      for (unsigned i = 0, e = 0; i < forms.size(); i++) {
        if (!in_entity && e < entities.size() && entities[e].start == i) {
          in_entity = entities[e].length;
          entity_start = true;
          entity_type = entities[e].type;
          e++;
        }
        os << forms[i] << '\t';
        if (in_entity) {
          os << (entity_start ? "B-" : "I-") << entity_type;
          entity_start = false;
          in_entity--;
        } else {
          os << '_';
        }
        os << '\n';
      }
      os << '\n' << flush;
    }
  }
}