本文整理汇总了C++中Words::getNumWords方法的典型用法代码示例。如果您正苦于以下问题:C++ Words::getNumWords方法的具体用法?C++ Words::getNumWords怎么用?C++ Words::getNumWords使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类Words
的用法示例。
在下文中一共展示了Words::getNumWords方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1:
// . computes a 64-bit "synonym base hash" of a query string: terms are
//   XORed together (order-independent, see comment at the XOR below) and
//   each term is represented by the smallest 64-bit id among itself and
//   all of its synonyms, so synonymous and/or reordered queries collide
//   on purpose ('search the web' == 'web search')
// . NOTE(review): the error paths below 'return false' from a function
//   declared 'long long', which callers cannot distinguish from a
//   legitimately empty hash of 0 -- confirm callers treat 0 as "error/no
//   hash". (Fragment is truncated by the source page; the tail of the
//   word loop and the final return are not visible here.)
// langId is language of the query
long long getSynBaseHash64 ( char *qstr , uint8_t langId ) {
Words ww;
ww.set3 ( qstr );
long nw = ww.getNumWords();
long long *wids = ww.getWordIds();
//char **wptrs = ww.getWords();
//long *wlens = ww.getWordLens();
long long baseHash64 = 0LL;
Synonyms syn;
// assume english if unknown to fix 'pandora's tower'
// vs 'pandoras tower' where both words are in both
// english and german so langid is unknown
if ( langId == langUnknown ) langId = langEnglish;
// . store re-written query into here then hash that string
// . this way we can get rid of spaces
//char rebuf[1024];
//char *p = rebuf;
//if ( strstr(qstr,"cheatcodes") )
//	log("hey");
// for deduping
// presumably 8-byte keys, no data slot, 1024 initial slots -- TODO
// confirm against HashTableX::set() signature
HashTableX dups;
if ( ! dups.set ( 8,0,1024,NULL,0,false,0,"qhddup") ) return false;
// scan the words
for ( long i = 0 ; i < nw ; i++ ) {
// skip if not alnum
// (punctuation/space "words" have a zero word id)
if ( ! wids[i] ) continue;
// get its synonyms into tmpBuf
char tmpBuf[TMPSYNBUFSIZE];
// . assume niceness of 0 for now
// . make sure to get all synsets!! ('love' has two synsets)
long naids = syn.getSynonyms (&ww,i,langId,tmpBuf,0);
// term freq algo
//long pop = g_speller.getPhrasePopularity(NULL,
// wids[i],
// true,
// langId);
// is it a queryStopWord like "the" or "and"?
bool isQueryStop = ::isQueryStopWord(NULL,0,wids[i]);
// a more restrictive list
bool isStop = ::isStopWord(NULL,0,wids[i]);
if ( ::isCommonQueryWordInEnglish(wids[i]) ) isStop = true;
// find the smallest one
// "min" starts as the word's own id and is lowered to the smallest
// synonym id found in the loop below
unsigned long long min = wids[i];
//char *minWordPtr = wptrs[i];
//long minWordLen = wlens[i];
// declare up here since we have a goto below
long j;
// add to table too
// if the word id itself was already seen, treat as a repeat
if ( dups.isInTable ( &min ) ) goto gotdup;
// add to it
if ( ! dups.addKey ( &min ) ) return false;
// now scan the synonyms, they do not include "min" in them
for ( j = 0 ; j < naids ; j++ ) {
// get it
unsigned long long aid64;
aid64 = (unsigned long long)syn.m_aids[j];
// if any syn already hashed then skip it and count
// as a repeated term. we have to do it this way
// rather than just getting the minimum synonym
// word id, because 'love' has two synsets and
// 'like', a synonym of 'love' only has one synset
// and they end up having different minimum synonym
// word ids!!!
if ( dups.isInTable ( &aid64 ) ) break;
// add it. this could fail!
if ( ! dups.addKey ( &aid64 ) ) return false;
// set it?
if ( aid64 >= min ) continue;
// got a new min
min = aid64;
//minWordPtr = syn.m_termPtrs[j];
//minWordLen = syn.m_termLens[j];
// get largest term freq of all synonyms
//long pop2 = g_speller.getPhrasePopularity(NULL,aid64,
// true,langId);
//if ( pop2 > pop ) pop = pop2;
}
// early break out means a hit in dups table
if ( j < naids ) {
gotdup:
// do not count as repeat if query stop word
// because they often repeat
if ( isQueryStop ) continue;
// count # of repeated word forms
//nrwf++;
continue;
}
// hash that now
// do not include stop words in synbasehash so
// 'search the web' != 'search web'
if ( ! isStop ) {
// no! make it order independent so 'search the web'
// equals 'web the search' and 'engine search'
// equals 'search engine'
//baseHash64 <<= 1LL;
baseHash64 ^= min;
}
// count it, but only if not a query stop word like "and"
// or "the" or "a". # of unique word forms.
//.........这里部分代码省略.........
示例2: sendTurkPageReply
//.........这里部分代码省略.........
// . fragment of sendTurkPageReply(): both the function head and the loop
//   tail are omitted by the source page, so only the middle is visible
// . walks every word of the parsed document ("xd") and, for words that
//   begin a section with the SEC_CONTROL flag, emits the section hash
//   plus markup that the human "turk" reviewer UI can toggle
// * remove all imgs. just src them to dev null.
// * allow for entering a custom title for an event or all events
// that are or will ever appear on the page.
// * when displaying the text of the events, use hyphens to
// delineate the section topology. strike out text as a section
// fence is activated.
// * when a section is activated is it easier to just redownload
// the whole text of the page? maybe just the text frame?
// * clicking on an individual sentence section should just remove
// that sentence. that is kinda a special content hash removal
// tag. like "Click here for video."
// * when an event id is selected i guess activate its bgcolor to
// be light blue for all sentences currently in the event that
// are not in activated sections. (make exception for designated
// title sections). so we need multiple tags for each events
// sentence div section. if sentence is split use multiple div tags
// then to keep the order. so each event sentence would have
// <div ev1=1 ev2=1 ev10=1>...</div> if it is in event ids 1,2 and
// 10. that way we can activate it when one of those event ids is
// activated.
SafeBuf sb;
// shortcuts (original comment read "int16_tcuts" -- garbled by an
// automated short->int16_t rename pass)
if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
Words *words = &xd->m_words;
int32_t nw = words->getNumWords();
char **wptrs = words->getWords();
int32_t *wlens = words->getWordLens();
nodeid_t *tids = words->getTagIds();
// a special array for printing </div> tags
// (one zeroed byte per word; mcalloc presumably zero-fills -- confirm)
char *endCounts = (char *)mcalloc ( nw ,"endcounts");
if ( ! endCounts ) return sendErrorReply ( st , g_errno );
//
// now loop over all the words. if word starts a section that has
// SEC_CONTROL bit set, and print out the section hash and a color
// tag to be activated if the turkey activates us.
// CAUTION: word may start multiple sections.
//
for ( int32_t i = 0 ; i < nw ; i++ ) {
// get section ptr
Section *sj = ss->m_sectionPtrs[i];
// sanity check. sj must be first section ptr that starts @ a
// (intentional crash-on-corruption idiom used throughout this codebase)
if ( sj && sj->m_a==i && sj->m_prev && sj->m_prev->m_a==i ) {
char *xx=NULL;*xx=0; }
// . does word #i start a section?
// . if section is control, print out the control
// . walk every section that starts at word #i (nested sections)
while ( sj && sj->m_a == i ) {
// print this section's hash
if ( sj->m_flags & SEC_CONTROL) {
// after the turkeys have made all the edits
// they need to submit the changes they made.
// how can we get that data sent back to the
// back end? we need to send back the colors
// of the sections that have been activated
// i guess. just do a loop over them.
示例3: getBestWindow
// . return the score of the highest-scoring window containing match #m
// . window is defined by the half-open interval [a,b) where a and b are
// word #'s in the Words array indicated by match #m
// . return -1 and set g_errno on error
// . parameters (as used in this visible fragment):
//   "lasta"        : in/out -- the "a" fencepost chosen by the previous
//                    call; scanning stops at it to avoid duplicate windows
//   "besta"/"bestb": out -- the chosen window, set to -1/-1 when no
//                    window can be built
//   "maxExcerptLen": character budget for the excerpt, enforced via the
//                    Pos character offsets in pos[]
//   "gotIt"/"retired": not referenced in the visible portion -- the
//                    fragment is truncated, see note at the bottom
int64_t Summary::getBestWindow ( Matches *matches, int32_t mm, int32_t *lasta,
				int32_t *besta, int32_t *bestb, char *gotIt,
				char *retired, int32_t maxExcerptLen ) {
// get the window around match #mm
Match *m = &matches->m_matches[mm];
// what is the word # of match #mm?
int32_t matchWordNum = m->m_wordNum;
// what Words/Pos/Bits classes is this match in?
Words *words = m->m_words;
Section **sp = NULL;
int32_t *pos = m->m_pos->m_pos;
// use "m_swbits" not "m_bits", that is what Bits::setForSummary() uses
const swbit_t *bb = m->m_bits->m_swbits;
// shortcut
if ( m->m_sections ) {
sp = m->m_sections->m_sectionPtrs;
}
int32_t nw = words->getNumWords();
int64_t *wids = words->getWordIds();
nodeid_t *tids = words->getTagIds();
// . sanity check
// . this prevents a core i've seen
if ( matchWordNum >= nw ) {
log("summary: got overflow condition for q=%s",m_q->m_orig);
// assume no best window
*besta = -1;
*bestb = -1;
*lasta = matchWordNum;
return 0;
}
// . we NULLify the section ptrs if we already used the word in another summary.
// . also bail if the word sits in a section we never excerpt from
//   (script/style/select/title)
int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
if ( (bb[matchWordNum] & D_USED) || ( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
// assume no best window
*besta = -1;
*bestb = -1;
*lasta = matchWordNum;
return 0;
}
// . "a" is the left fence post of the window (it is a word # in Words)
// . go to the left as far as we can
// . thus we decrement "a"
int32_t a = matchWordNum;
// "posa" is the character position of the END of word #a
int32_t posa = pos[a+1];
int32_t firstFrag = -1;
bool startOnQuote = false;
bool goodStart = false;
int32_t wordCount = 0;
// . decrease "a" as int32_t as we stay within maxNumCharsPerLine
// . avoid duplicating windows by using "lasta", the last "a" of the
// previous call to getBestWindow(). This can happen if our last
// central query term was close to this one.
for ( ; a > 0 && posa - pos[a-1] < maxExcerptLen && a > *lasta; a-- ) {
// . don't include any "dead zone",
// . dead zones have already been used for the summary, and
// we are getting a second/third/... excerpt here now then
// stop if its the start of a sentence, too
// stop before title word
if ( (bb[a-1] & D_USED) || (bb[a] & D_STARTS_SENTENCE) || ( bb[a-1] & D_IN_TITLE )) {
goodStart = true;
break;
}
// don't go beyond an LI, TR, P tag
if ( tids && ( tids[a-1] == TAG_LI ||
	       tids[a-1] == TAG_TR ||
	       tids[a-1] == TAG_P ||
	       tids[a-1] == TAG_DIV ) ) {
goodStart = true;
break;
}
// stop if its the start of a quoted sentence
if ( a+1<nw && (bb[a+1] & D_IN_QUOTES) &&
     words->getWord(a)[0] == '\"' ){
startOnQuote = true;
goodStart = true;
break;
}
// find out the first instance of a fragment (comma, etc)
// watch out! because frag also means 's' in there's
if ( ( bb[a] & D_STARTS_FRAG ) && !(bb[a-1] & D_IS_STRONG_CONNECTOR) && firstFrag == -1 ) {
firstFrag = a;
// NOTE(review): fragment truncated here by the source page -- the
// rest of the left scan, the right ("b") scan and the scoring are
// not visible
//.........这里部分代码省略.........
示例4: setTitle
// . generates the best title for a document by collecting up to
//   MAX_TIT_CANDIDATES candidate word ranges (from inlink text, rss
//   items and, presumably, on-page markup -- the fragment is truncated)
//   and scoring them
// returns false and sets g_errno on error
bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, Query *query,
		       LinkInfo *linkInfo, Url *firstUrl, const char *filteredRootTitleBuf, int32_t filteredRootTitleBufSize,
		       uint8_t contentType, uint8_t langId, int32_t niceness ) {
// make Msg20.cpp faster if it is just has
// Msg20Request::m_setForLinkInfo set to true, no need to extricate a title.
if ( maxTitleLen <= 0 ) {
return true;
}
m_niceness = niceness;
m_maxTitleLen = maxTitleLen;
// if this is too big the "first line" algo can be huge!!!
// and really slow everything way down with a huge title candidate
int32_t maxTitleWords = 128;
// assume no title
reset();
int32_t NW = words->getNumWords();
//
// now get all the candidates
//
// . allow up to 100 title CANDIDATES
// . "as" is the word # of the first word in the candidate
// . "bs" is the word # of the last word IN the candidate PLUS ONE
int32_t n = 0;
int32_t as[MAX_TIT_CANDIDATES];
int32_t bs[MAX_TIT_CANDIDATES];
float scores[MAX_TIT_CANDIDATES];
Words *cptrs[MAX_TIT_CANDIDATES];
int32_t types[MAX_TIT_CANDIDATES];
int32_t parent[MAX_TIT_CANDIDATES];
// record the scoring algos effects
float baseScore [MAX_TIT_CANDIDATES];
float noCapsBoost [MAX_TIT_CANDIDATES];
float qtermsBoost [MAX_TIT_CANDIDATES];
float inCommonCandBoost[MAX_TIT_CANDIDATES];
// reset these
for ( int32_t i = 0 ; i < MAX_TIT_CANDIDATES ; i++ ) {
// assume no parent
parent[i] = -1;
}
// xml and words class for each link info, rss item
Xml tx[MAX_TIT_CANDIDATES];
Words tw[MAX_TIT_CANDIDATES];
int32_t ti = 0;
// restrict how many link texts and rss blobs we check for titles
// because title recs like www.google.com have hundreds and can
// really slow things down to like 50ms for title generation
int32_t kcount = 0;
int32_t rcount = 0;
//int64_t x = gettimeofdayInMilliseconds();
// . get every link text
// . TODO: repeat for linkInfo2, the imported link text
for ( Inlink *k = NULL; linkInfo && (k = linkInfo->getNextInlink(k)) ; ) {
// breathe
QUICKPOLL(m_niceness);
// fast skip check for link text
// (cap at ~20 link-text candidates; note kcount still increments
// on the skipping pass because of the ++ in the condition)
if ( k->size_linkText >= 3 && ++kcount >= 20 ) continue;
// fast skip check for rss item
if ( k->size_rssItem > 10 && ++rcount >= 20 ) continue;
// set Url
Url u;
u.set( k->getUrl(), k->size_urlBuf );
// is it the same host as us?
bool sh = true;
// skip if not from same host and should be
if ( firstUrl->getHostLen() != u.getHostLen() ) {
sh = false;
}
// skip if not from same host and should be
if ( strncmp( firstUrl->getHost(), u.getHost(), u.getHostLen() ) ) {
sh = false;
}
// get the link text
if ( k->size_linkText >= 3 ) {
char *p = k->getLinkText();
int32_t plen = k->size_linkText - 1;
if ( ! verifyUtf8 ( p , plen ) ) {
log("title: set4 bad link text from url=%s", k->getUrl());
continue;
}
// now the words.
// NOTE(review): fragment truncated by the source page inside this
// branch -- candidate collection and scoring are not visible
if ( !tw[ti].set( k->getLinkText(), k->size_linkText - 1, true, 0 ) ) {
//.........这里部分代码省略.........
示例5: setSummary
//.........这里部分代码省略.........
// . fragment of Summary::setSummary(): the head (window-scoring loop) and
//   the tail (excerpt copy) are omitted by the source page
// . visible portion: record the winning window [maxa,maxb) for match
//   #maxi, clamp it to the document word count, then decide whether the
//   excerpt must be preceded by an ellipsis "..."
// we got a new winner
maxi = i;
maxa = a;
maxb = b;
maxScore = score;
// save this too
gbmemcpy ( maxGotIt , gotIt , m_q->m_numWords );
}
// retire the query words in the winning summary
//log( LOG_WARN,"summary: took %" PRId64" ms to finish getbestwindo",
// gettimeofdayInMilliseconds() - stget );
// all done if no winner was made
if ( maxi == -1 || maxa == -1 || maxb == -1) {
break;
}
// who is the winning match?
maxm = &matches->m_matches[maxi];
Words *ww = maxm->m_words;
// we now use "m_swbits" for the summary bits since they are
// of size sizeof(swbit_t), a int16_t at this point
swbit_t *bb = maxm->m_bits->m_swbits;
// this should be impossible
// (defensive clamp rather than crash: log and pin the window to the
// last word of the document)
if ( maxa > ww->getNumWords() || maxb > ww->getNumWords() ) {
log ( LOG_WARN,"query: summary starts or ends after "
      "document is over! maxa=%" PRId32" maxb=%" PRId32" nw=%" PRId32,
      maxa, maxb, ww->getNumWords() );
maxa = ww->getNumWords() - 1;
maxb = ww->getNumWords();
}
// assume we do not preceed with ellipsis "..."
bool needEllipsis = true;
const char *c = ww->getWord(maxa)+0;
// rule of thumb, don't use ellipsis if the first letter is capital, or a non letter
// is punct word before us pair acrossable? if so then we probably are not the start of a sentence.
// or if into the sample and previous excerpt had an ellipsis do not bother using one for us.
if ( !is_alpha_utf8(c) || is_upper_utf8(c) ||
     (bb[maxa] & D_STARTS_SENTENCE) ||
     (p > m_summary && hadEllipsis)) {
needEllipsis = false;
}
if ( needEllipsis ) {
// break out if no room for "..."
// (4 bytes for "... " plus 2 bytes of slack at the buffer end)
if ( p + 4 + 2 > pend ) {
break;
}
// space first?
if ( p > m_summary ) {
*p++ = ' ';
}