当前位置: 首页>>代码示例>>C++>>正文


C++ Words::getNumWords方法代码示例

本文整理汇总了C++中Words::getNumWords方法的典型用法代码示例。如果您正苦于以下问题:C++ Words::getNumWords方法的具体用法?C++ Words::getNumWords怎么用?C++ Words::getNumWords使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在Words的用法示例。


在下文中一共展示了Words::getNumWords方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。

示例1: getSynBaseHash64

// langId is language of the query
long long getSynBaseHash64 ( char *qstr , uint8_t langId ) {
	Words ww;
	ww.set3 ( qstr );
	long nw = ww.getNumWords();
	long long *wids = ww.getWordIds();
	//char **wptrs = ww.getWords();
	//long *wlens = ww.getWordLens();
	long long baseHash64 = 0LL;
	Synonyms syn;
	// assume english if unknown to fix 'pandora's tower'
	// vs 'pandoras tower' where both words are in both
	// english and german so langid is unknown
	if ( langId == langUnknown ) langId = langEnglish;
	// . store re-written query into here then hash that string
	// . this way we can get rid of spaces
	//char rebuf[1024];
	//char *p = rebuf;
	//if ( strstr(qstr,"cheatcodes") )
	//	log("hey");
	// for deduping
	HashTableX dups;
	if ( ! dups.set ( 8,0,1024,NULL,0,false,0,"qhddup") ) return false;
	// scan the words
	for ( long i = 0 ; i < nw ; i++ ) {
		// skip if not alnum
		if ( ! wids[i] ) continue;
		// get its synonyms into tmpBuf
		char tmpBuf[TMPSYNBUFSIZE];
		// . assume niceness of 0 for now
		// . make sure to get all synsets!! ('love' has two synsets)
		long naids = syn.getSynonyms (&ww,i,langId,tmpBuf,0);
		// term freq algo
		//long pop = g_speller.getPhrasePopularity(NULL,
		//					 wids[i],
		//					 true,
		//					 langId);
		// is it a queryStopWord like "the" or "and"?
		bool isQueryStop = ::isQueryStopWord(NULL,0,wids[i]);
		// a more restrictive list
		bool isStop = ::isStopWord(NULL,0,wids[i]);
		if ( ::isCommonQueryWordInEnglish(wids[i]) ) isStop = true;
		// find the smallest one
		unsigned long long min = wids[i];
		//char *minWordPtr = wptrs[i];
		//long  minWordLen = wlens[i];
		// declare up here since we have a goto below
		long j;
		// add to table too
		if ( dups.isInTable ( &min ) ) goto gotdup;
		// add to it
		if ( ! dups.addKey ( &min ) ) return false;
		// now scan the synonyms, they do not include "min" in them
		for ( j = 0 ; j < naids ; j++ ) {
			// get it
			unsigned long long aid64;
			aid64 = (unsigned long long)syn.m_aids[j];
			// if any syn already hashed then skip it and count
			// as a repeated term. we have to do it this way
			// rather than just getting the minimum synonym 
			// word id, because 'love' has two synsets and
			// 'like', a synonym of 'love' only has one synset
			// and they end up having different minimum synonym
			// word ids!!!
			if ( dups.isInTable ( &aid64 ) ) break;
			// add it. this could fail!
			if ( ! dups.addKey ( &aid64 ) ) return false;
			// set it?
			if ( aid64 >= min ) continue;
			// got a new min
			min = aid64;
			//minWordPtr = syn.m_termPtrs[j];
			//minWordLen = syn.m_termLens[j];
			// get largest term freq of all synonyms
			//long pop2 = g_speller.getPhrasePopularity(NULL,aid64,
			//					  true,langId);
			//if ( pop2 > pop ) pop = pop2;
		}
		// early break out means a hit in dups table
		if ( j < naids ) {
		gotdup:
			// do not count as repeat if query stop word
			// because they often repeat
			if ( isQueryStop ) continue;
			// count # of repeated word forms
			//nrwf++;
			continue;
		}
		// hash that now
		// do not include stop words in synbasehash so
		// 'search the web' != 'search web'
		if ( ! isStop ) {
			// no! make it order independent so 'search the web'
			// equals 'web the search' and 'engine search'
			// equals 'search engine'
			//baseHash64 <<= 1LL;
			baseHash64 ^= min;
		}
		// count it, but only if not a query stop word like "and"
		// or "the" or "a". # of unique word forms.
//.........这里部分代码省略.........
开发者ID:BillWangCS,项目名称:open-source-search-engine,代码行数:101,代码来源:Synonyms.cpp

示例2: sendTurkPageReply


//.........这里部分代码省略.........

	// * remove all imgs. just src them to dev null.

	// * allow for entering a custom title for an event or all events
	//   that are or will ever appear on the page. 

	// * when displaying the text of the events, use hyphens to
	//   delineate the section topology. strike out text as a section
	//   fence is activated.

	// * when a section is activated is it easier to just redownload
	//   the whole text of the page? maybe just the text frame?

	// * clicking on an individual sentence section should just remove
	//   that sentence. that is kinda a special content hash removal
	//   tag. like "Click here for video."

	// * when an event id is selected i guess activate its bgcolor to
	//   be light blue for all sentences currently in the event that
	//   are not in activated sections. (make exception for designated 
	//   title sections). so we need multiple tags for each events
	//   sentence div section. if sentence is split use multiple div tags
	//   then to keep the order. so each event sentence would have 
	//   <div ev1=1 ev2=1 ev10=1>...</div> if it is in event ids 1,2 and
	//   10. that way we can activate it when one of those event ids is
	//   activated.


	SafeBuf sb;

	// shortcuts
	if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
	Words     *words = &xd->m_words;
	int32_t       nw    = words->getNumWords();
	char     **wptrs = words->getWords();
	int32_t      *wlens = words->getWordLens();
	nodeid_t  *tids  = words->getTagIds();

	// a special array for printing </div> tags
	char *endCounts = (char *)mcalloc ( nw ,"endcounts");
	if ( ! endCounts ) return sendErrorReply ( st , g_errno );


	// 
	// now loop over all the words. if word starts a section that has
	// SEC_CONTROL bit set, and print out the section hash and a color
	// tag to be activated if the turkey activates us.
	// CAUTION: word may start multiple sections.
	//
	for ( int32_t i = 0 ; i < nw ; i++ ) { 
		// get section ptr
		Section *sj = ss->m_sectionPtrs[i];
		// sanity check. sj must be first section ptr that starts @ a
		if ( sj && sj->m_a==i && sj->m_prev && sj->m_prev->m_a==i ) {
			char *xx=NULL;*xx=0; }
		// . does word #i start a section?
		// . if section is control, print out the control
		while ( sj && sj->m_a == i ) {
			// print this section's hash
			if ( sj->m_flags & SEC_CONTROL) {
				// after the turkeys have made all the edits
				// they need to submit the changes they made.
				// how can we get that data sent back to the
				// back end? we need to send back the colors
				// of the sections that have been activated
				// i guess. just do a loop over them.
开发者ID:DeadNumbers,项目名称:open-source-search-engine,代码行数:67,代码来源:Turkdb.cpp

示例3: getBestWindow

// . return the score of the highest-scoring window containing match #m
// . window is defined by the half-open interval [a,b) where a and b are 
//   word #'s in the Words array indicated by match #m
// . return -1 and set g_errno on error
int64_t Summary::getBestWindow ( Matches *matches, int32_t mm, int32_t *lasta,
                                 int32_t *besta, int32_t *bestb, char *gotIt,
                                 char *retired, int32_t maxExcerptLen ) {
	// get the window around match #mm
	Match *m = &matches->m_matches[mm];

	// what is the word # of match #mm?
	int32_t matchWordNum = m->m_wordNum;

	// what Words/Pos/Bits classes is this match in?
	Words *words = m->m_words;
	Section **sp = NULL;
	int32_t *pos = m->m_pos->m_pos;

	// use "m_swbits" not "m_bits", that is what Bits::setForSummary() uses
	const swbit_t *bb = m->m_bits->m_swbits;

	// shortcut
	if ( m->m_sections ) {
		sp = m->m_sections->m_sectionPtrs;
	}

	int32_t nw = words->getNumWords();
	int64_t *wids = words->getWordIds();
	nodeid_t *tids = words->getTagIds();

	// . sanity check
	// . this prevents a core i've seen
	if ( matchWordNum >= nw ) {
		log("summary: got overflow condition for q=%s",m_q->m_orig);

		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . we NULLify the section ptrs if we already used the word in another summary.
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
	if ( (bb[matchWordNum] & D_USED) || ( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . "a" is the left fence post of the window (it is a word # in Words)
	// . go to the left as far as we can 
	// . thus we decrement "a"
	int32_t a = matchWordNum;

	// "posa" is the character position of the END of word #a
	int32_t posa = pos[a+1];
	int32_t firstFrag = -1;
	bool startOnQuote = false;
	bool goodStart = false;
	int32_t wordCount = 0;

	// . decrease "a" as long as we stay within maxNumCharsPerLine
	// . avoid duplicating windows by using "lasta", the last "a" of the
	//   previous call to getBestWindow(). This can happen if our last
	//   central query term was close to this one.
	for ( ; a > 0 && posa - pos[a-1] < maxExcerptLen && a > *lasta; a-- ) {
		// . don't include any "dead zone", 
		// . dead zones have already been used for the summary, and
		//   we are getting a second/third/... excerpt here now then
		// stop if its the start of a sentence, too
		// stop before title word
		if ( (bb[a-1] & D_USED) || (bb[a] & D_STARTS_SENTENCE) || ( bb[a-1] & D_IN_TITLE )) {
			goodStart = true;
			break;
		}

		// don't go beyond an LI, TR, P tag
		if ( tids && ( tids[a-1] == TAG_LI ||
		               tids[a-1] == TAG_TR ||
		               tids[a-1] == TAG_P  ||
		               tids[a-1] == TAG_DIV ) ) {
			goodStart = true;
			break;
		}

		// stop if its the start of a quoted sentence
		if ( a+1<nw && (bb[a+1] & D_IN_QUOTES) && 
		     words->getWord(a)[0] == '\"' ){
			startOnQuote = true;
			goodStart    = true;
			break;
		}

		// find out the first instance of a fragment (comma, etc)
		// watch out! because frag also means 's' in there's
		if ( ( bb[a] & D_STARTS_FRAG ) && !(bb[a-1] & D_IS_STRONG_CONNECTOR) && firstFrag == -1 ) {
			firstFrag = a;
//.........这里部分代码省略.........
开发者ID:exename,项目名称:open-source-search-engine,代码行数:101,代码来源:Summary.cpp

示例4: setTitle

// returns false and sets g_errno on error
bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, Query *query,
                       LinkInfo *linkInfo, Url *firstUrl, const char *filteredRootTitleBuf, int32_t filteredRootTitleBufSize,
                       uint8_t contentType, uint8_t langId, int32_t niceness ) {
	// make Msg20.cpp faster if it just has
	// Msg20Request::m_setForLinkInfo set to true, no need to extricate a title.
	if ( maxTitleLen <= 0 ) {
		return true;
	}

	m_niceness = niceness;
	m_maxTitleLen = maxTitleLen;

	// if this is too big the "first line" algo can be huge!!!
	// and really slow everything way down with a huge title candidate
	int32_t maxTitleWords = 128;

	// assume no title
	reset();

	int32_t NW = words->getNumWords();

	//
	// now get all the candidates
	//

	// . allow up to 100 title CANDIDATES
	// . "as" is the word # of the first word in the candidate
	// . "bs" is the word # of the last word IN the candidate PLUS ONE
	int32_t n = 0;
	int32_t as[MAX_TIT_CANDIDATES];
	int32_t bs[MAX_TIT_CANDIDATES];
	float scores[MAX_TIT_CANDIDATES];
	Words *cptrs[MAX_TIT_CANDIDATES];
	int32_t types[MAX_TIT_CANDIDATES];
	int32_t parent[MAX_TIT_CANDIDATES];

	// record the scoring algos effects
	float  baseScore        [MAX_TIT_CANDIDATES];
	float  noCapsBoost      [MAX_TIT_CANDIDATES];
	float  qtermsBoost      [MAX_TIT_CANDIDATES];
	float  inCommonCandBoost[MAX_TIT_CANDIDATES];

	// reset these
	for ( int32_t i = 0 ; i < MAX_TIT_CANDIDATES ; i++ ) {
		// assume no parent
		parent[i] = -1;
	}

	// xml and words class for each link info, rss item
	Xml   tx[MAX_TIT_CANDIDATES];
	Words tw[MAX_TIT_CANDIDATES];
	int32_t  ti = 0;

	// restrict how many link texts and rss blobs we check for titles
	// because title recs like www.google.com have hundreds and can
	// really slow things down to like 50ms for title generation
	int32_t kcount = 0;
	int32_t rcount = 0;

	//int64_t x = gettimeofdayInMilliseconds();

	// . get every link text
	// . TODO: repeat for linkInfo2, the imported link text
	for ( Inlink *k = NULL; linkInfo && (k = linkInfo->getNextInlink(k)) ; ) {
		// breathe
		QUICKPOLL(m_niceness);
		// fast skip check for link text
		if ( k->size_linkText >= 3 && ++kcount >= 20 ) continue;
		// fast skip check for rss item
		if ( k->size_rssItem > 10 && ++rcount >= 20 ) continue;

		// set Url
		Url u;
		u.set( k->getUrl(), k->size_urlBuf );

		// is it the same host as us?
		bool sh = true;

		// skip if not from same host and should be
		if ( firstUrl->getHostLen() != u.getHostLen() ) {
			sh = false;
		}

		// skip if not from same host and should be
		if ( strncmp( firstUrl->getHost(), u.getHost(), u.getHostLen() ) ) {
			sh = false;
		}

		// get the link text
		if ( k->size_linkText >= 3 ) {
			char *p    = k->getLinkText();
			int32_t  plen = k->size_linkText - 1;
			if ( ! verifyUtf8 ( p , plen ) ) {
				log("title: set4 bad link text from url=%s", k->getUrl());
				continue;
			}

			// now the words.
			if ( !tw[ti].set( k->getLinkText(), k->size_linkText - 1, true, 0 ) ) {
//.........这里部分代码省略.........
开发者ID:lemire,项目名称:open-source-search-engine,代码行数:101,代码来源:Title.cpp

示例5: setSummary


//.........这里部分代码省略.........

			// we got a new winner
			maxi     = i;
			maxa     = a;
			maxb     = b;
			maxScore = score;

			// save this too
			gbmemcpy ( maxGotIt , gotIt , m_q->m_numWords );

		}
	
		// retire the query words in the winning summary

		
		//log( LOG_WARN,"summary: took %" PRId64" ms to finish getbestwindo",
		//    gettimeofdayInMilliseconds() - stget );


		// all done if no winner was made
		if ( maxi == -1 || maxa == -1 || maxb == -1) {
			break;
		}

		// who is the winning match?
		maxm = &matches->m_matches[maxi];
		Words *ww = maxm->m_words;

		// we now use "m_swbits" for the summary bits since they are
		// of size sizeof(swbit_t), a int16_t at this point
		swbit_t *bb = maxm->m_bits->m_swbits;

		// this should be impossible
		if ( maxa > ww->getNumWords() || maxb > ww->getNumWords() ) {
			log ( LOG_WARN,"query: summary starts or ends after "
			      "document is over! maxa=%" PRId32" maxb=%" PRId32" nw=%" PRId32,
			      maxa, maxb, ww->getNumWords() );
			maxa = ww->getNumWords() - 1;
			maxb = ww->getNumWords();
		}

		// assume we precede with an ellipsis "..." until one of the checks below rules it out
		bool needEllipsis = true;
		
		const char *c = ww->getWord(maxa)+0;

		// rule of thumb, don't use ellipsis if the first letter is capital, or a non letter
		// is punct word before us pair acrossable? if so then we probably are not the start of a sentence.
		// or if into the sample and previous excerpt had an ellipsis do not bother using one for us.
		if ( !is_alpha_utf8(c) || is_upper_utf8(c) ||
		     (bb[maxa] & D_STARTS_SENTENCE) ||
		     (p > m_summary && hadEllipsis)) {
			needEllipsis = false;
		}

		if ( needEllipsis ) {
			// break out if no room for "..."
			if ( p + 4 + 2 > pend ) {
				break;
			}

			// space first?
			if ( p > m_summary ) {
				*p++ = ' ';
			}
开发者ID:exename,项目名称:open-source-search-engine,代码行数:66,代码来源:Summary.cpp


注:本文中的Words::getNumWords方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。