本文整理汇总了C++中Words::set方法的典型用法代码示例。如果您正苦于以下问题:C++ Words::set方法的具体用法?C++ Words::set怎么用?C++ Words::set使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类Words
的用法示例。
在下文中一共展示了Words::set方法的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: addMatches
bool Matches::addMatches( char *s, int32_t slen, mf_t flags ) {
// . do not breach
// . happens a lot with a lot of link info text
if ( m_numMatchGroups >= MAX_MATCHGROUPS ) {
return true;
}
// get some new ptrs for this match group
Words *wp = &m_wordsArray [ m_numMatchGroups ];
Bits *bp = &m_bitsArray [ m_numMatchGroups ];
Pos *pb = &m_posArray [ m_numMatchGroups ];
// set the words class for this match group
if ( !wp->set( s, slen, true ) ) {
return false;
}
// bits vector
if ( ! bp->setForSummary ( wp ) ) {
return false;
}
// position vector
if ( ! pb->set ( wp ) ) {
return false;
}
// record the start
int32_t startNumMatches = m_numMatches;
// sometimes it returns true w/o incrementing this
int32_t n = m_numMatchGroups;
// . add all the Match classes from this match group
// . this increments m_numMatchGroups on success
bool status = addMatches( wp, NULL, NULL, bp, pb, flags );
// if this matchgroup had some, matches, then keep it
if ( m_numMatches > startNumMatches ) {
return status;
}
// otherwise, reset it, useless
wp->reset();
bp->reset();
pb->reset();
// do not decrement the counter if we never incremented it
if ( n == m_numMatchGroups ) {
return status;
}
// ok, remove it
m_numMatchGroups--;
return status;
}
示例2: parse_doc_icu
// Parse a UTF-8/ICU-path document: strip (x)html markup with Xml, then
// tokenize the extracted text into Words.
// "charset" is accepted for signature parity with the other parsers but
// is unused here.
void parse_doc_icu(char *s, int len, bool doHash, char *charset){
	Xml xml;
	xml.set( s, len, TITLEREC_CURRENT_VERSION, 0, CT_HTML );
	// Extract text from (x)html into a fixed 64KB scratch buffer.
	char *text_buf = (char*)malloc(64*1024);
	// BUGFIX: check the allocation before handing the buffer to
	// getText() — the original dereferenced a potentially NULL pointer.
	if ( ! text_buf ) {
		return;
	}
	int32_t textLen = xml.getText( text_buf, 64 * 1024, 0, 99999999, doFilterSpaces );
	Words w;
	// Tokenize only the bytes getText() actually produced.
	w.set(text_buf, textLen, doHash);
	free(text_buf);
}
示例3: parse_doc_8859_1
// Parse an ISO-8859-1 document: strip (x)html markup with Xml, then
// tokenize the extracted text into Words.
// "charset" is accepted for signature parity with the other parsers but
// is unused here.
void parse_doc_8859_1(char *s, int len, bool doHash,char *charset)
{
	Xml xml;
	xml.set( s, len, TITLEREC_CURRENT_VERSION, 0, CT_HTML );
	// Extract text from (x)html
	char *text_buf = (char*)malloc(len+1);
	// BUGFIX: check the allocation before use.
	if ( ! text_buf ) {
		return;
	}
	// BUGFIX: capture the number of bytes getText() wrote and tokenize
	// only that much. The original passed the raw html length "len" to
	// words.set(), but the extracted text is normally shorter than the
	// markup, so it tokenized uninitialized bytes past the end of the
	// extracted text. (Matches the textLen pattern in parse_doc_icu.)
	int32_t textLen = xml.getText( text_buf, len, 0, 99999999, doFilterSpaces );
	Words words;
	// just tokenize words
	words.set(text_buf, textLen, doHash);
	free(text_buf);
}
示例4: parse_doc_icu
// Parse a UTF-8/ICU-path document (legacy Xml/Words API variant):
// strip (x)html markup, then tokenize the extracted text.
// "charset" is accepted for signature parity but is unused here.
void parse_doc_icu(char *s, int len, bool doHash, char *charset){
	Xml xml;
	xml.set(csUTF8,s,len,false, 0,false, TITLEREC_CURRENT_VERSION);
	//fprintf(stderr,"\nparse_doc_icu\n");
	// Extract text from (x)html into a fixed 64KB scratch buffer.
	char *text_buf = (char*)malloc(64*1024);
	// BUGFIX: check the allocation before handing the buffer to
	// getText() — the original dereferenced a potentially NULL pointer.
	if ( ! text_buf ) {
		return;
	}
	long textLen = xml.getText(text_buf,
				   64*1024,
				   0,
				   99999999,
				   false,
				   true,
				   false,
				   doFilterSpaces,
				   false);
	Words w;
	// Tokenize the bytes getText() produced.
	w.set(true,false, text_buf, textLen, TITLEREC_CURRENT_VERSION,doHash);
	free(text_buf);
}
示例5: generateSummary
// Test helper: run the full summary pipeline (parse -> tokenize ->
// sections -> query matching -> title -> summary) over "htmlInput" for
// "queryStr"/"urlStr", leaving the result in "summary".
static void generateSummary( Summary &summary, char *htmlInput, const char *queryStr, const char *urlStr ) {
	Xml xml;
	ASSERT_TRUE(xml.set(htmlInput, strlen(htmlInput), 0, CT_HTML));
	Words words;
	ASSERT_TRUE(words.set(&xml, true));
	Bits bits;
	ASSERT_TRUE(bits.set(&words));
	Url url;
	url.set(urlStr);
	Sections sections;
	ASSERT_TRUE(sections.set(&words, &bits, &url, "", CT_HTML));
	Query query;
	ASSERT_TRUE(query.set2(queryStr, langEnglish, true));
	// Zero out LinkInfo; only m_lisize needs a real value here.
	LinkInfo linkInfo;
	memset ( &linkInfo , 0 , sizeof(LinkInfo) );
	linkInfo.m_lisize = sizeof(LinkInfo);
	Title title;
	ASSERT_TRUE(title.setTitle(&xml, &words, 80, &query, &linkInfo, &url, NULL, 0, CT_HTML, langEnglish));
	Pos pos;
	ASSERT_TRUE(pos.set(&words));
	Bits bitsForSummary;
	ASSERT_TRUE(bitsForSummary.setForSummary(&words));
	Phrases phrases;
	ASSERT_TRUE(phrases.set(&words, &bits));
	Matches matches;
	matches.setQuery(&query);
	// BUGFIX: the copied snippet had "&sect;" HTML-entity-decoded into
	// "§ions"; restore the intended "&sections" (address of the local
	// Sections object) in both calls below.
	ASSERT_TRUE(matches.set(&words, &phrases, &sections, &bitsForSummary, &pos, &xml, &title, &url, &linkInfo));
	summary.setSummary(&xml, &words, &sections, &pos, &query, 180, 3, 3, 180, &url, &matches, title.getTitle(), title.getTitleLen());
}
示例6: parse_doc_8859_1
// Parse an ISO-8859-1 document (legacy Xml/Words API variant):
// strip (x)html markup, then tokenize the extracted text.
// "charset" is accepted for signature parity but is unused here.
void parse_doc_8859_1(char *s, int len, bool doHash,char *charset)
{
	Xml xml;
	xml.set(csASCII,s,len,false, 0, false, TITLEREC_CURRENT_VERSION);
	//fprintf(stderr,"\nparse_doc_8859_1\n");
	// Extract text from (x)html
	char *text_buf = (char*)malloc(len+1);
	// BUGFIX: check the allocation before use.
	if ( ! text_buf ) {
		return;
	}
	xml.getText(text_buf,
		    len,
		    0,
		    99999999,
		    false,
		    true,
		    false,
		    doFilterSpaces,
		    false);
	Words words;
	// just tokenize words
	// BUGFIX: the copied snippet misspelled the version constant as
	// TITEREC_CURRENT_VERSION (missing "L"), an undefined identifier.
	words.set(false, text_buf, TITLEREC_CURRENT_VERSION, doHash);
	free(text_buf);
}
示例7: processLoop
// returns false if blocked, true otherwise
bool processLoop ( void *state ) {
// get it
State2 *st = (State2 *)state;
// get the tcp socket from the state
TcpSocket *s = st->m_socket;
// get it
XmlDoc *xd = &st->m_xd;
if ( ! xd->m_loaded ) {
// setting just the docid. niceness is 0.
//xd->set3 ( st->m_docId , st->m_coll , 0 );
// callback
xd->setCallback ( state , processLoop );
// . and tell it to load from the old title rec
// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
// . this sets xd->ptr_* and all other member vars from
// the old title rec if found in titledb.
if ( ! xd->loadFromOldTitleRec ( ) ) return false;
}
if ( g_errno ) return sendErrorReply ( st , g_errno );
// now force it to load old title rec
//char **tr = xd->getTitleRec();
SafeBuf *tr = xd->getTitleRecBuf();
// blocked? return false if so. it will call processLoop() when it rets
if ( tr == (void *)-1 ) return false;
// we did not block. check for error? this will free "st" too.
if ( ! tr ) return sendErrorReply ( st , g_errno );
// if title rec was empty, that is a problem
if ( xd->m_titleRecBuf.length() == 0 )
return sendErrorReply ( st , ENOTFOUND);
// set callback
char *na = xd->getIsNoArchive();
// wait if blocked
if ( na == (void *)-1 ) return false;
// error?
if ( ! na ) return sendErrorReply ( st , g_errno );
// forbidden? allow turkeys through though...
if ( ! st->m_isAdmin && *na )
return sendErrorReply ( st , ENOCACHE );
SafeBuf *sb = &st->m_sb;
// &page=4 will print rainbow sections
if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
// do not repeat this call
st->m_printed = true;
// this will call us again since we called
// xd->setCallback() above to us
if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
return false;
}
char *contentType = "text/html";
char format = st->m_format;
if ( format == FORMAT_XML ) contentType = "text/xml";
if ( format == FORMAT_JSON ) contentType = "application/json";
// if we printed a special page (like rainbow sections) then return now
if ( st->m_printed ) {
bool status = g_httpServer.sendDynamicPage (s,
//buf,bufLen,
sb->getBufStart(),
sb->getLength(),
-1,false,
//"text/html",
contentType,
-1, NULL, "utf8" );
// nuke state2
mdelete ( st , sizeof(State2) , "PageGet1" );
delete (st);
return status;
}
/*
// this was calling XmlDoc and setting sections, etc. to
// get the SpiderReply junk... no no no
// is it banned or filtered? this ignores the TagRec in the titleRec
// and uses msg8a to get it fresh instead
char *vi = xd->getIsFiltered();//Visible( );
// wait if blocked
if ( vi == (void *)-1 ) return false;
// error?
if ( ! vi ) return sendErrorReply ( st , g_errno );
// banned?
if ( ! st->m_isAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
*/
// get the utf8 content
char **utf8 = xd->getUtf8Content();
//long len = xd->size_utf8Content - 1;
// wait if blocked???
if ( utf8 == (void *)-1 ) return false;
// strange
if ( xd->size_utf8Content<=0) {
log("pageget: utf8 content <= 0");
return sendErrorReply(st,EBADENGINEER );
//.........这里部分代码省略.........
示例8: setTitle
// returns false and sets g_errno on error
bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, Query *query,
LinkInfo *linkInfo, Url *firstUrl, const char *filteredRootTitleBuf, int32_t filteredRootTitleBufSize,
uint8_t contentType, uint8_t langId, int32_t niceness ) {
// make Msg20.cpp faster if it is just has
// Msg20Request::m_setForLinkInfo set to true, no need to extricate a title.
if ( maxTitleLen <= 0 ) {
return true;
}
m_niceness = niceness;
m_maxTitleLen = maxTitleLen;
// if this is too big the "first line" algo can be huge!!!
// and really slow everything way down with a huge title candidate
int32_t maxTitleWords = 128;
// assume no title
reset();
int32_t NW = words->getNumWords();
//
// now get all the candidates
//
// . allow up to 100 title CANDIDATES
// . "as" is the word # of the first word in the candidate
// . "bs" is the word # of the last word IN the candidate PLUS ONE
int32_t n = 0;
int32_t as[MAX_TIT_CANDIDATES];
int32_t bs[MAX_TIT_CANDIDATES];
float scores[MAX_TIT_CANDIDATES];
Words *cptrs[MAX_TIT_CANDIDATES];
int32_t types[MAX_TIT_CANDIDATES];
int32_t parent[MAX_TIT_CANDIDATES];
// record the scoring algos effects
float baseScore [MAX_TIT_CANDIDATES];
float noCapsBoost [MAX_TIT_CANDIDATES];
float qtermsBoost [MAX_TIT_CANDIDATES];
float inCommonCandBoost[MAX_TIT_CANDIDATES];
// reset these
for ( int32_t i = 0 ; i < MAX_TIT_CANDIDATES ; i++ ) {
// assume no parent
parent[i] = -1;
}
// xml and words class for each link info, rss item
Xml tx[MAX_TIT_CANDIDATES];
Words tw[MAX_TIT_CANDIDATES];
int32_t ti = 0;
// restrict how many link texts and rss blobs we check for titles
// because title recs like www.google.com have hundreds and can
// really slow things down to like 50ms for title generation
int32_t kcount = 0;
int32_t rcount = 0;
//int64_t x = gettimeofdayInMilliseconds();
// . get every link text
// . TODO: repeat for linkInfo2, the imported link text
for ( Inlink *k = NULL; linkInfo && (k = linkInfo->getNextInlink(k)) ; ) {
// breathe
QUICKPOLL(m_niceness);
// fast skip check for link text
if ( k->size_linkText >= 3 && ++kcount >= 20 ) continue;
// fast skip check for rss item
if ( k->size_rssItem > 10 && ++rcount >= 20 ) continue;
// set Url
Url u;
u.set( k->getUrl(), k->size_urlBuf );
// is it the same host as us?
bool sh = true;
// skip if not from same host and should be
if ( firstUrl->getHostLen() != u.getHostLen() ) {
sh = false;
}
// skip if not from same host and should be
if ( strncmp( firstUrl->getHost(), u.getHost(), u.getHostLen() ) ) {
sh = false;
}
// get the link text
if ( k->size_linkText >= 3 ) {
char *p = k->getLinkText();
int32_t plen = k->size_linkText - 1;
if ( ! verifyUtf8 ( p , plen ) ) {
log("title: set4 bad link text from url=%s", k->getUrl());
continue;
}
// now the words.
if ( !tw[ti].set( k->getLinkText(), k->size_linkText - 1, true, 0 ) ) {
//.........这里部分代码省略.........
示例9: set
// . return length stored into "buf"
// . content must be NULL terminated
// . if "useAnchors" is true we do click and scroll
// . if "isQueryTerms" is true, we do typical anchors in a special way
// . returns the length stored into "sb", or -1 on error
// . "content" must be NULL terminated
// . when "useAnchors" is true we do click and scroll
// . when "isQueryTerms" is true, typical anchors get special handling
int32_t Highlight::set ( SafeBuf *sb,
			 char *content ,
			 int32_t contentLen ,
			 // primary language of the document (for synonyms)
			 char docLangId ,
			 Query *q ,
			 bool doStemming ,
			 bool useAnchors ,
			 const char *baseUrl ,
			 const char *frontTag ,
			 const char *backTag ,
			 int32_t fieldCode ,
			 int32_t niceness ) {
	const int32_t version = TITLEREC_CURRENT_VERSION;
	// tokenize the content (computeId = true; may contain html entities)
	Words words;
	if ( ! words.set ( content, contentLen, version, true, true ) )
		return -1;
	// per-word bit flags derived from the tokens
	Bits bits;
	if ( ! bits.set ( &words, version, niceness ) )
		return -1;
	// phrase info built from the words and bits
	Phrases phrases;
	if ( ! phrases.set ( &words, &bits, true, false, version, niceness ) )
		return -1;
	// locate the query term matches within the tokenized content
	Matches matches;
	matches.setQuery ( q );
	if ( ! matches.addMatches ( &words, &phrases ) )
		return -1;
	// remember how many terms we will be highlighting
	m_numMatches = matches.getNumMatches();
	// delegate to the Words-based overload to emit the actual markup
	return set ( sb, &words, &matches, doStemming, useAnchors,
		     baseUrl, frontTag, backTag, fieldCode, q );
}