本文整理汇总了C++中QUICKPOLL函数的典型用法代码示例。如果您正苦于以下问题:C++ QUICKPOLL函数的具体用法?C++ QUICKPOLL怎么用?C++ QUICKPOLL使用的例子?那么, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了QUICKPOLL函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: countWords
// . a quickie: generous estimate of the number of words in the UTF-8
//   buffer [p, p+plen)
// . this url gives a m_preCount that is too low. why?
//   http://go.tfol.com/163/speed.asp
// . each pass consumes one run of non-alnum characters followed by one run
//   of alnum characters and bumps the tally once for each run
// . every '<' met inside a non-alnum run bumps the tally once more
// . the tally starts at 1 and 10 is added on return, so an empty buffer
//   yields 13
// . "niceness" is passed to QUICKPOLL once per character scanned
long countWords ( char *p , long plen , long niceness ) {
	char *end   = p + plen;
	long  total = 1;
	do {
		// run of punctuation, i.e. anything that is not alnum
		while ( p < end && ! is_alnum_utf8 ( p ) ) {
			// breathe
			QUICKPOLL ( niceness );
			// in case being set from xml tags, count as words now
			if ( *p == '<' ) total++;
			p += getUtf8CharSize ( p );
		}
		// one for the punctuation run
		total++;
		// run of alnum characters
		while ( p < end && is_alnum_utf8 ( p ) ) {
			// breathe
			QUICKPOLL ( niceness );
			p += getUtf8CharSize ( p );
		}
		// one for the alnum run
		total++;
	} while ( p < end );
	// some extra for good measure
	return total + 10;
}
示例2: makeKey
// . ***** META LIST DELETE LOOP *****
// . scan for meta lists to remove from syncdb
// . check every D KEY
// . must NOT have any "need to send request" keys (a bit set)
// . must NOT have any "need to recv request" keys (b bit set)
// . must NOT have our "need to add" key (c bit set)
// . for every "d" key in m_qt that canDeleteMetaList() approves, a negative
//   key is added to m_rdb and the node is then removed from m_qt
// . returns early, WITHOUT setting m_calledLoop3, if m_rdb.addRecord() fails
void Syncdb::loop3 ( ) {
	// . loop over the meta lists we need to delete
	// . these are "d" keys
	// . use a "tid" of 0
	key128_t sk = makeKey ( 0,0,0,1,0,0,0,0 );
	key128_t ek = makeKey ( 0,0,0,1,0,0xffffffff,0xffffffffffffffffLL,1 );
	// get the first node in sequence, if any
	long nn = m_qt.getNextNode ( 0 , (char *)&sk );
	// do the loop. a negative node number means there are no more nodes
	for ( ; nn >= 0 ; nn = m_qt.getNextNode ( nn ) ) {
		// breathe
		QUICKPOLL ( MAX_NICENESS );
		// get key
		key128_t k = *(key128_t *)m_qt.getKey ( nn );
		// stop when we hit the end
		if ( k > ek ) break;
		// get zid
		uint64_t zid = getZid ( &k );
		// get sid
		uint32_t sid = getSid ( &k );
		// have we sent/recvd all checkoff requests required? have
		// we added the meta list? if so, we can nuke it from syncdb
		if ( ! canDeleteMetaList ( sid, zid ) ) {
			// no use banging away at this sid any more since we
			// are missing another action for this one
			sid++;
			// find the key of the FIRST meta list we need to add
			// for this new senderId, "sid"
			key128_t nk = makeKey ( 0,0,0,1,0,sid,0,0 );
			// undo the m_qt.getNextNode(nn) we call in for loop
			nn = m_qt.getPrevNode ( 0 , (char *)&nk );
			// sanity check: deliberately crash if there is no
			// previous node
			if ( nn < 0 ) { char *xx=NULL;*xx=0; }
			// get next key from this new sid
			continue;
		}
		// . make the negative key for syncdb
		// . it just uses a negative "c" key, with a tid of 0
		key128_t dk = makeKey ( 0,0,1,0,0,sid,zid,0);
		// . add it to syncdb to signify a delete
		// . this returns false and sets g_errno on error
		if(!m_rdb.addRecord((collnum_t)0,(char *)&dk,NULL,0,
				    MAX_NICENESS)) return;
		// delete it from quick tree now that we added the negative
		// key successfully to syncdb
		long dn = m_qt.getNode ( 0, (char *)&k );
		// . must be there! deliberately crash if it is not
		// . node numbers start at 0 (see the "nn >= 0" loop test above)
		//   and the other lookups in this function report "none" with
		//   a negative number, so test for negative instead of zero;
		//   the old "! dn" test crashed on a valid node 0 and let a
		//   failed lookup through to deleteNode()
		// . NOTE(review): assumes getNode() uses the same "negative
		//   means not found" convention as getNextNode()/getPrevNode()
		//   -- confirm against the tree class
		if ( dn < 0 ) { char *xx=NULL;*xx=0; }
		// nuke it
		m_qt.deleteNode ( dn , true );
	}
	// . success
	// . do not recall until big loop completes a round
	m_calledLoop3 = true;
}
示例3: logTrace
// . called with the reply buffer for our request; points m_list at it
// . returns nothing (void)
// . m_list is set with ownData=true, so the list takes over the
//   "reply" buffer, which is "replyMaxSize" bytes with "replySize" in use
// . optionally logs how long the termlist fetch took
void Msg0::gotReply ( char *reply , int32_t replySize , int32_t replyMaxSize ) {
	logTrace( g_conf.m_logTraceMsg0, "BEGIN" );

	// timing debug, only for posdb requests that recorded a start time
	const bool wantTiming = g_conf.m_logTimingNet &&
				m_rdbId==RDB_POSDB &&
				m_startTime > 0;
	if ( wantTiming ) {
		log(LOG_TIMING,"net: msg0: Got termlist, termId=%" PRIu64". "
		    "Took %" PRId64" ms, replySize=%" PRId32" (niceness=%" PRId32").",
		    g_posdb.getTermId ( m_startKey ) ,
		    gettimeofdayInMilliseconds()-m_startTime,
		    replySize,m_niceness);
	}

	// TODO: insert some seals for security, may have to alloc
	//       separate space for the list then

	// breathe
	QUICKPOLL(m_niceness);

	// set the list w/ the remaining data
	m_list->set ( reply           ,
		      replySize       ,
		      reply           , // alloc buf begins here, too
		      replyMaxSize    ,
		      m_startKey      ,
		      m_endKey        ,
		      m_fixedDataSize ,
		      true            , // ownData?
		      m_useHalfKeys   ,
		      m_ks            );

	// . the list used to be added to a local termlist cache and to the
	//   rdb's net cache at this point; both steps are disabled
	// . original note: NO! no more network caching, we got gigabit...
	//   save space for our disk, no replication, man, mem is expensive

	logTrace( g_conf.m_logTraceMsg0, "END" );
}
示例4: while
// after you read/write from/to disk, copy into the page cache
void DiskPageCache::addPages ( long vfd,
char *buf,
long numBytes,
long long offset ,
long niceness ){
// check for override function
//if ( m_isOverriden ) {
// m_addPages2 ( this,
// vfd,
// buf,
// numBytes,
// offset );
// return;
//}
// if vfd is -1, then we were not able to add a map for this file
if ( vfd < 0 ) return;
// no NULL ptrs
if ( ! buf ) return;
// return if no pages allowed in page cache
if ( m_maxMemOff == 0 ) return;
// or disabled
if ( ! m_enabled ) return;
// disabled at the master controls?
if ( m_switch && ! *m_switch ) return;
// sometimes the file got unlinked on us
if ( ! m_memOff[vfd] ) return;
// what is the page range?
long long sp = offset / m_pageSize ;
// point to it
char *bufPtr = buf;
char *bufEnd = buf + numBytes;
// . do not add first page unless right on the boundary
// . how much did we exceed the boundary by?
oldshort skip = offset - sp * m_pageSize ;
long size = m_pageSize - skip;
// now add the remaining pages
while ( bufPtr < bufEnd ) {
// breathe
QUICKPOLL(niceness);
// ensure "size" is not too big
if ( bufPtr + size > bufEnd ) size = bufEnd - bufPtr;
// add the page to memory
addPage ( vfd , sp , bufPtr , size , skip );
// advance
bufPtr += size;
sp++;
size = m_pageSize;
skip = 0;
}
}
示例5: while
// . returns true if document is adult, false otherwise
// . rudimentary adult detection over the NUL-terminated text "s"
// . scans runs of alpha characters ("words"); words of 1 or 2 letters are
//   skipped and do not touch the score
// . a single obscene word makes the document adult right away
// . otherwise a running score is kept: +2 for a dirty word, -1 (floored at
//   0) for any other word; reaching 5 makes the document adult
//     W = non-dirty word
//     D = dirty word
//     . = sequence of punctuation/num and/or 1 to 2 letter words
//   dirty sequences:
//     . D . D . D .       (dirties=6)
//     . D . W . D . D .   (dirties=5)
//   basically, if 3 out of 4 words in a subsequence are "dirty" then the
//   whole document is "adult" content
// . QUICKPOLL is only invoked after a non-dirty word of 3+ letters
bool AdultBit::getBit ( char *s , int32_t niceness) {
	int32_t score = 0;
	int32_t start = 0;
	for ( ; ; ) {
		// skip until we hit an alpha
		while ( s[start] && ! is_alpha_a(s[start]) ) start++;
		// hit the end of the text without a verdict: not adult
		if ( ! s[start] ) return false;
		// find the end of this alpha char sequence
		int32_t end = start + 1;
		while ( s[end] && is_alpha_a(s[end]) ) end++;
		int32_t wordLen = end - start;
		// only words longer than 2 letters are examined
		if ( wordLen > 2 ) {
			// it's adult content if it has just 1 obscene word
			if ( isObscene ( (char *) s+start , wordLen ) )
				return true;
			if ( isDirty ( (char *) s+start , wordLen ) ) {
				score += 2;
				if ( score >= 5 ) return true;
			}
			else {
				// a clean word lowers the score, never below 0
				score--;
				if ( score < 0 ) score = 0;
				// breathe
				QUICKPOLL((niceness));
			}
		}
		// continue right after this word
		start = end;
	}
}
示例6: getStatState
// . return false with g_errno set on error, true otherwise
// . looking at the number of points per second
// . average query latency for last 20 queries
// . average disk bytes read for last 20 accesses
// . val is the State::m_value measurement, a float
// . also each point may represent a number of bytes transferred in which
//   case we use that number rather than "1", which is the default
// . walks m_list from its start and passes every record whose m_labelHash
//   equals label->m_labelHash to addPoint()
// . the only failure path visible here is getStatState() returning NULL
bool Statsdb::addPointsFromList ( Label *label ) {
	StatState *state = getStatState ( label->m_graphHash );
	// return false with g_errno set
	if ( ! state ) return false;
	// scan the list for our junk, starting from the top
	m_list.resetListPtr();
	while ( ! m_list.isExhausted() ) {
		// breathe
		QUICKPOLL(m_niceness);
		// current record's key and data
		StatKey  *key  = (StatKey  *)m_list.getCurrentRec();
		StatData *data = (StatData *)m_list.getCurrentData();
		// only records carrying this label's hash are added
		if ( key->m_labelHash == label->m_labelHash )
			addPoint ( key , data , state , label );
		// on to the next record
		m_list.skipCurrentRecord();
	}
	return true;
}
示例7: QUICKPOLL
//
// . add EventPoints to m_sb3/m_ht3
// . these basically represent binary events or parm state changes
// . i.e. "a merge operation"
// . i.e. "changing a parm value"
//
bool Statsdb::addEventPointsFromList ( ) {
m_list.resetListPtr();
// scan the list for our junk
for ( ; ! m_list.isExhausted() ; m_list.skipCurrentRecord() ) {
// breathe
QUICKPOLL(m_niceness);
// get that
StatKey *sk = (StatKey *)m_list.getCurrentRec();
// and data
StatData *sd = (StatData *)m_list.getCurrentData();
// must be an "event" stat... i.e. a status change
if ( ! sd->isEvent() ) continue;
// make sure to stack lines so they do not touch
// each other...
if ( ! addEventPoint ( sk->m_time1 ,
sk->m_labelHash , // parmHash
sd->getOldVal () ,
sd->getNewVal () ,
10 )) // thickness
return false;
}
return true;
}
示例8: ask
//.........这里部分代码省略.........
// . groupMask must turn on higher bits first (count downwards kinda)
// . titledb and spiderdb use special masks to get groupId
// if diffbot.cpp is reading spiderdb from each shard we have to
// get groupid from hostid here lest we core in getGroupId() below.
// it does that for dumping spiderdb to the client browser. they
// can download the whole enchilada.
if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
m_shardNum = 0;
// did they force it? core until i figure out what this is
else if ( forceParitySplit >= 0 )
//m_groupId = g_hostdb.getGroupId ( forceParitySplit );
m_shardNum = forceParitySplit;
else
//m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );
m_shardNum = getShardNum ( m_rdbId , startKey );
// if we are looking up a termlist in posdb that is split by termid and
// not the usual docid then we have to set this posdb key bit that tells
// us that ...
if ( noSplit && m_rdbId == RDB_POSDB )
m_shardNum = g_hostdb.getShardNumByTermId ( startKey );
// how is this used?
if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();
// if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: shardNum [%" PRId32"]", __FILE__,__func__, __LINE__, m_shardNum);
// . store these parameters
// . get a handle to the rdb in case we can satisfy locally
// . returns NULL and sets g_errno on error
QUICKPOLL((m_niceness));
Rdb *rdb = getRdbFromId ( m_rdbId );
if ( ! rdb ) return true;
// we need the fixedDataSize
m_fixedDataSize = rdb->getFixedDataSize();
m_useHalfKeys = rdb->useHalfKeys();
// . debug msg
// . Msg2 does this when checking for a cached compound list.
// compound lists do not actually exist, they are merges of smaller
// UOR'd lists.
if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree)) {
log( LOG_LOGIC, "net: msg0: Weird. check but don't add... rdbid=%" PRId32".", ( int32_t ) m_rdbId );
}
// set this here since we may not call msg5 if list not local
//m_list->setFixedDataSize ( m_fixedDataSize );
// . now that we do load balancing we don't want to do a disk lookup
// even if local if we are merging or dumping
// . UNLESS g_conf.m_preferLocalReads is true
if ( preferLocalReads == -1 )
preferLocalReads = g_conf.m_preferLocalReads;
// . always prefer local for full split clusterdb
// . and keep the tfndb/titledb lookups in the same stripe
// . so basically we can't do biased caches if fully split
//if ( g_conf.m_fullSplit ) preferLocalReads = true;
preferLocalReads = true;
// it it stored locally?
bool isLocal = ( m_hostId == -1 && //g_hostdb.m_groupId == m_groupId );
m_shardNum == getMyShardNum() );
// only do local lookups if this is true
示例9: gotListWrapper
// . slot should be auto-nuked upon transmission or error
// . TODO: ensure if this sendReply() fails does it really nuke the slot?
void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx ) {
// get the state
State00 *st0 = (State00 *)state;
// extract the udp slot and list and msg5
UdpSlot *slot = st0->m_slot;
RdbList *list = &st0->m_list;
Msg5 *msg5 = &st0->m_msg5;
UdpServer *us = st0->m_us;
// sanity check -- ensure they match
//if ( niceness != st0->m_niceness )
// log("Msg0: niceness mismatch");
// debug msg
//if ( niceness != 0 )
// log("HEY! niceness is not 0");
// timing debug
if ( g_conf.m_logTimingNet || g_conf.m_logDebugNet ) {
//log("Msg0:hndled request %"UINT64"",gettimeofdayInMilliseconds());
int32_t size = -1;
if ( list ) size = list->getListSize();
log(LOG_TIMING|LOG_DEBUG,
"net: msg0: Handled request for data. "
"Now sending data termId=%"UINT64" size=%"INT32""
" transId=%"INT32" ip=%s port=%i took=%"INT64" "
"(niceness=%"INT32").",
g_posdb.getTermId(msg5->m_startKey),
size,slot->m_transId,
iptoa(slot->m_ip),slot->m_port,
gettimeofdayInMilliseconds() - st0->m_startTime ,
st0->m_niceness );
}
// debug
//if ( ! msg5->m_includeTree )
// log("hotit\n");
// on error nuke the list and it's data
if ( g_errno ) {
mdelete ( st0 , sizeof(State00) , "Msg0" );
delete (st0);
// TODO: free "slot" if this send fails
us->sendErrorReply ( slot , g_errno );
return;
}
QUICKPOLL(st0->m_niceness);
// point to the serialized list in "list"
char *data = list->getList();
int32_t dataSize = list->getListSize();
char *alloc = list->getAlloc();
int32_t allocSize = list->getAllocSize();
// tell list not to free the data since it is a reply so UdpServer
// will free it when it destroys the slot
list->setOwnData ( false );
// keep track of stats
Rdb *rdb = getRdbFromId ( st0->m_rdbId );
if ( rdb ) rdb->sentReplyGet ( dataSize );
// TODO: can we free any memory here???
// keep track of how long it takes to complete the send
st0->m_startTime = gettimeofdayInMilliseconds();
// debug point
int32_t oldSize = msg5->m_minRecSizes;
int32_t newSize = msg5->m_minRecSizes + 20;
// watch for wrap around
if ( newSize < oldSize ) newSize = 0x7fffffff;
if ( dataSize > newSize && list->getFixedDataSize() == 0 &&
// do not annoy me with these linkdb msgs
dataSize > newSize+100 )
log(LOG_LOGIC,"net: msg0: Sending more data than what was "
"requested. Ineffcient. Bad engineer. dataSize=%"INT32" "
"minRecSizes=%"INT32".",dataSize,oldSize);
/*
// always compress these lists
if ( st0->m_rdbId == RDB_SECTIONDB ) { // && 1 == 3) {
// get sh48, the sitehash
key128_t *startKey = (key128_t *)msg5->m_startKey ;
int64_t sh48 = g_datedb.getTermId(startKey);
// debug
//log("msg0: got sectiondblist from disk listsize=%"INT32"",
// list->getListSize());
if ( dataSize > 50000 )
log("msg0: sending back list rdb=%"INT32" "
"listsize=%"INT32" sh48=0x%"XINT64"",
(int32_t)st0->m_rdbId,
dataSize,
sh48);
// save it
int32_t origDataSize = dataSize;
// store compressed list on itself
char *dst = list->m_list;
// warn if niceness is 0!
if ( st0->m_niceness == 0 )
log("msg0: compressing sectiondb list at niceness 0!");
// compress the list
uint32_t lastVoteHash32 = 0LL;
SectionVote *lastVote = NULL;
//.........这里部分代码省略.........
示例10: removeExpiredLocks
// . removes stale entries from g_spiderLoop.m_lockTable
// . hostId is the remote hostid sending us the lock request; it is only
//   used in the debug log message
// . a lock is removed when its collnum no longer maps to a collection rec,
//   or when its m_expires is non-zero and earlier than the current global
//   time
// . does nothing if the global time has not advanced past the last
//   completed scan, so at most one full scan per second
void removeExpiredLocks ( int32_t hostId ) {
	// when we last cleaned them out
	static time_t s_lastTime = 0;
	int32_t nowGlobal = getTimeGlobalNoCore();
	// only do this once per second at the most
	if ( nowGlobal <= s_lastTime ) return;
	// shortcut
	HashTableX *ht = &g_spiderLoop.m_lockTable;
 restart:
	// scan the slots
	int32_t ns = ht->m_numSlots;
	// . clean out expired locks...
	// . if lock was there and m_expired is up, then nuke it!
	// . when Rdb.cpp receives the "fake" title rec it removes the
	//   lock, only it just sets the m_expired to a few seconds in the
	//   future to give the negative doledb key time to be absorbed.
	//   that way we don't repeat the same url we just got done spidering.
	// . this happens when we launch our lock request on a url that we
	//   or a twin is spidering or has just finished spidering, and
	//   we get the lock, but we avoided the negative doledb key.
	for ( int32_t i = 0 ; i < ns ; i++ ) {
		// breathe
		QUICKPOLL(MAX_NICENESS);
		// skip if empty
		if ( ! ht->m_flags[i] ) continue;
		// cast lock
		UrlLock *lock = (UrlLock *)ht->getValueFromSlot(i);
		int64_t lockKey = *(int64_t *)ht->getKeyFromSlot(i);
		// . if collnum got deleted or reset
		// . also reject a negative collnum so that it is never used
		//   to index m_recs below
		// . NOTE(review): assumes collnum_t is a signed type --
		//   confirm; if it is unsigned the extra test is a no-op
		collnum_t collnum = lock->m_collnum;
		if ( collnum < 0 ||
		     collnum >= g_collectiondb.m_numRecs ||
		     ! g_collectiondb.m_recs[collnum] ) {
			log("spider: removing lock from missing collnum "
			    "%" PRId32,(int32_t)collnum);
			goto nuke;
		}
		// skip if not yet expired. an m_expires of 0 is never
		// treated as expired
		if ( lock->m_expires == 0 ) continue;
		if ( lock->m_expires >= nowGlobal ) continue;
		// note it for now
		if ( g_conf.m_logDebugSpider )
			log("spider: removing lock after waiting. elapsed=%" PRId32"."
			    " lockKey=%" PRIu64" hid=%" PRId32" expires=%" PRIu32" "
			    "nowGlobal=%" PRIu32,
			    (nowGlobal - lock->m_timestamp),
			    lockKey,hostId,
			    (uint32_t)lock->m_expires,
			    (uint32_t)nowGlobal);
	nuke:
		// nuke the slot and possibly re-chain
		ht->removeSlot ( i );
		// gotta restart from the top since table may have shrunk
		goto restart;
	}
	// store it. only reached once a full scan removed nothing
	s_lastTime = nowGlobal;
}
示例11: log
// . now come here when we got the necessary index lists
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg39::intersectLists ( ) { // bool updateReadInfo ) {
// bail on error
if ( g_errno ) {
hadError:
log("msg39: Had error getting termlists: %s.",
mstrerror(g_errno));
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
//sendReply (m_slot,this,NULL,0,0,true);
return true;
}
// timestamp log
if ( m_debug ) {
log(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
"Got %"INT32" lists in %"INT64" ms"
, (PTRTYPE)this,m_tmpq.getNumTerms(),
gettimeofdayInMilliseconds() - m_startTime);
m_startTime = gettimeofdayInMilliseconds();
}
// breathe
QUICKPOLL ( m_r->m_niceness );
// ensure collection not deleted from under us
CollectionRec *cr = g_collectiondb.getRec ( m_r->m_collnum );
if ( ! cr ) {
g_errno = ENOCOLLREC;
goto hadError;
}
// . set the IndexTable so it can set it's score weights from the
// termFreqs of each termId in the query
// . this now takes into account the special termIds used for sorting
// by date (0xdadadada and 0xdadadad2 & TERMID_MASK)
// . it should weight them so much so that the summation of scores
// from other query terms cannot make up for a lower date score
// . this will actually calculate the top
// . this might also change m_tmpq.m_termSigns
// . this won't do anything if it was already called
m_posdbTable.init ( &m_tmpq ,
m_debug ,
this ,
&m_tt ,
m_r->m_collnum,//ptr_coll ,
&m_msg2 , // m_lists ,
//m_tmpq.m_numTerms , // m_numLists
m_r );
// breathe
QUICKPOLL ( m_r->m_niceness );
// . we have to do this here now too
// . but if we are getting weights, we don't need m_tt!
// . actually we were using it before for rat=0/bool queries but
// i got rid of NO_RAT_SLOTS
if ( ! m_allocedTree && ! m_posdbTable.allocTopTree() ) {
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
//sendReply ( m_slot , this , NULL , 0 , 0 , true);
return true;
}
// if msg2 had ALL empty lists we can cut it short
if ( m_posdbTable.m_topTree->m_numNodes == 0 ) {
//estimateHitsAndSendReply ( );
return true;
}
// we have to allocate this with each call because each call can
// be a different docid range from doDocIdSplitLoop.
if ( ! m_posdbTable.allocWhiteListTable() ) {
log("msg39: Had error allocating white list table: %s.",
mstrerror(g_errno));
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
//sendReply (m_slot,this,NULL,0,0,true);
return true;
}
// do not re do it if doing docid range splitting
m_allocedTree = true;
// . now we must call this separately here, not in allocTopTree()
// . we have to re-set the QueryTermInfos with each docid range split
// since it will set the list ptrs from the msg2 lists
if ( ! m_posdbTable.setQueryTermInfo () ) return true;
// print query term bit numbers here
for ( int32_t i = 0 ; m_debug && i < m_tmpq.getNumTerms() ; i++ ) {
QueryTerm *qt = &m_tmpq.m_qterms[i];
//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
char *tpc = qt->m_term + qt->m_termLen;
char tmp = *tpc;
*tpc = '\0';
SafeBuf sb;
sb.safePrintf("query: msg39: BITNUM query term #%"INT32" \"%s\" "
"bitnum=%"INT32" ", i , qt->m_term, qt->m_bitNum );
//.........这里部分代码省略.........
示例12: reset
// returns false and sets g_errno on error
bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, Query *query,
LinkInfo *linkInfo, Url *firstUrl, const char *filteredRootTitleBuf, int32_t filteredRootTitleBufSize,
uint8_t contentType, uint8_t langId, int32_t niceness ) {
// make Msg20.cpp faster if it is just has
// Msg20Request::m_setForLinkInfo set to true, no need to extricate a title.
if ( maxTitleLen <= 0 ) {
return true;
}
m_niceness = niceness;
m_maxTitleLen = maxTitleLen;
// if this is too big the "first line" algo can be huge!!!
// and really slow everything way down with a huge title candidate
int32_t maxTitleWords = 128;
// assume no title
reset();
int32_t NW = words->getNumWords();
//
// now get all the candidates
//
// . allow up to 100 title CANDIDATES
// . "as" is the word # of the first word in the candidate
// . "bs" is the word # of the last word IN the candidate PLUS ONE
int32_t n = 0;
int32_t as[MAX_TIT_CANDIDATES];
int32_t bs[MAX_TIT_CANDIDATES];
float scores[MAX_TIT_CANDIDATES];
Words *cptrs[MAX_TIT_CANDIDATES];
int32_t types[MAX_TIT_CANDIDATES];
int32_t parent[MAX_TIT_CANDIDATES];
// record the scoring algos effects
float baseScore [MAX_TIT_CANDIDATES];
float noCapsBoost [MAX_TIT_CANDIDATES];
float qtermsBoost [MAX_TIT_CANDIDATES];
float inCommonCandBoost[MAX_TIT_CANDIDATES];
// reset these
for ( int32_t i = 0 ; i < MAX_TIT_CANDIDATES ; i++ ) {
// assume no parent
parent[i] = -1;
}
// xml and words class for each link info, rss item
Xml tx[MAX_TIT_CANDIDATES];
Words tw[MAX_TIT_CANDIDATES];
int32_t ti = 0;
// restrict how many link texts and rss blobs we check for titles
// because title recs like www.google.com have hundreds and can
// really slow things down to like 50ms for title generation
int32_t kcount = 0;
int32_t rcount = 0;
//int64_t x = gettimeofdayInMilliseconds();
// . get every link text
// . TODO: repeat for linkInfo2, the imported link text
for ( Inlink *k = NULL; linkInfo && (k = linkInfo->getNextInlink(k)) ; ) {
// breathe
QUICKPOLL(m_niceness);
// fast skip check for link text
if ( k->size_linkText >= 3 && ++kcount >= 20 ) continue;
// fast skip check for rss item
if ( k->size_rssItem > 10 && ++rcount >= 20 ) continue;
// set Url
Url u;
u.set( k->getUrl(), k->size_urlBuf );
// is it the same host as us?
bool sh = true;
// skip if not from same host and should be
if ( firstUrl->getHostLen() != u.getHostLen() ) {
sh = false;
}
// skip if not from same host and should be
if ( strncmp( firstUrl->getHost(), u.getHost(), u.getHostLen() ) ) {
sh = false;
}
// get the link text
if ( k->size_linkText >= 3 ) {
char *p = k->getLinkText();
int32_t plen = k->size_linkText - 1;
if ( ! verifyUtf8 ( p , plen ) ) {
log("title: set4 bad link text from url=%s", k->getUrl());
continue;
}
// now the words.
if ( !tw[ti].set( k->getLinkText(), k->size_linkText - 1, true, 0 ) ) {
//.........这里部分代码省略.........
示例13: is_digit
// . add the phrase that starts with the ith word
// . "read Of Mice and Men" should make 3 phrases:
// . read.ofmice
// . ofmice
// . mice.andmen
void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
// . if the ith word cannot start a phrase then we have no phrase
// . we indicate NULL phrasesIds with a spam of PSKIP
// . we now index all regardless! we want to be able to search
// for "a thing" or something. so do it!
//if ( ! m_bits->canStartPhrase ( i ) ) {
// m_phraseSpam[i] = PSKIP;
// m_phraseIds [i] = 0LL;
// return;
//}
// MDW: now Weights.cpp should encompass all this logic
// or if score <= 0, set in Scores.cpp
//if ( m_wordScores && m_wordScores[i] <= 0 ) {
// m_phraseSpam[i] = PSKIP;
// m_phraseIds [i] = 0LL;
// return;
//}
// hash of the phrase
int64_t h = 0LL;
// the hash of the two-word phrase (now we do 3,4 and 5 word phrases)
int64_t h2 = 0LL;
int64_t h3 = 0LL;
//int64_t h4 = 0LL;
//int64_t h5 = 0LL;
// reset
unsigned char pos = 0;
// now look for other tokens that should follow the ith token
int32_t nw = m_words->getNumWords();
int32_t numWordsInPhrase = 1;
// use the min spam from all words in the phrase as the spam for phrase
char minSpam = -1;
// we need to hash "1 / 8" differently from "1.8" from "1,000" etc.
char isNum = is_digit(m_wptrs[i][0]);
// min score
//int32_t minScore ;
//if ( m_wordScores ) minScore = m_wordScores[i];
// if i is not a stop word, it can set the min spam initially
//if ( ! m_bits->isStopWord(i) &&m_spam ) minSpam = m_spam->getSpam(i);
// do not include punct/tag words in the m_numWordsTotal[j] count
// of the total words in the phrase. these are just useless tails.
int32_t lastWordj = -1;
// loop over following words
int32_t j;
bool hasHyphen ;
bool hasStopWord2 ;
// . NOTE: a token can start a phrase but NOT be in it.
// . like a large number for example.
// . wordId is the lower ascii hash of the ith word
// . NO... this is allowing the query operator PiiPe to start
// a phrase but not be in it, then the phrase id ends up just
// being the following word's id. causing the synonyms code to
// give a synonym which it should not in Synonyms::set()
if ( ! m_bits->canBeInPhrase(i) )
// so indeed, skip it then
goto nophrase;
//h = hash64 ( h, m_words->getWordId(i));
h = m_wids[i];
// set position
pos = (unsigned char)m_wlens[i];
//if (m_words->getStripWordId(i))
// h2 = hash64 ( h2, m_words->getStripWordId(i));
//else h2 = h;
hasHyphen = false;
hasStopWord2 = m_bits->isStopWord(i);
// this makes it true now too
//if ( m_wlens[i] <= 2 ) hasStopWord = true;
for ( j = i + 1 ; j < nw ; j++ ) {
QUICKPOLL(niceness);
// . do not allow more than 32 alnum/punct "words" in a phrase
// . this prevents phrases with 100,000 words from slowing
// us down. would put us in a huge double-nested for loop
if ( j > i + 32 ) goto nophrase;
// deal with punct words
if ( ! m_wids[j] ) {
// if we cannot pair across word j then break
if ( ! m_bits->canPairAcross (j) ) break;
// does it have a hyphen?
if (j==i+1 && m_words->hasChar(j,'-')) hasHyphen=true;
/*
// "D & B" --> dandb
if (j==i+1 && m_words->hasChar(j,'&')) {
// set this
hasStopWord = true;
// insert "and"
int32_t conti=pos;
h = hash64Lower_utf8_cont("and",3,h,&conti);
pos=conti;
// the two-word phrase, set it if we need to
//.........这里部分代码省略.........
示例14: log
// . when a dump completes we free the primary mem space and make
// the secondary mem space the new primary mem space
void RdbMem::freeDumpedMem( RdbTree *tree ) {
// bail if we have no mem
if ( m_memSize == 0 ) return;
log("rdbmem: start freeing dumped mem");
//char *memEnd = m_mem + m_memSize;
// this should still be true so allocData() returns m_ptr2 ptrs
if ( ! m_rdb->m_inDumpLoop ) { g_process.shutdownAbort(true); }
// count how many data nodes we had to move to avoid corruption
int32_t count = 0;
int32_t scanned = 0;
for ( int32_t i = 0 ; i < tree->m_minUnusedNode ; i++ ) {
// give up control to handle search query stuff of niceness 0
QUICKPOLL ( MAX_NICENESS );
// skip node if parents is -2 (unoccupied)
if ( tree->m_parents[i] == -2 ) continue;
scanned++;
// get the ptr
char *data = tree->m_data[i];
if ( ! data ) continue;
// how could it's data not be stored in here?
// if ( data < m_mem ) {
// log("rdbmem: bad data1");
// continue;
// }
// if ( data >= memEnd ) {
// log("rdbmem: bad data2");
// continue;
// }
// is it in primary mem? m_ptr1 mem was just dump
// if growing upward
bool needsMove = false;
// if the primary mem (that was dumped) is
// growing upwards
if ( m_ptr1 < m_ptr2 ) {
// and the node data is in it...
if ( data < m_ptr1 )
needsMove = true;
}
// growing downward otherwise
else if ( data >= m_ptr1 ) {
needsMove = true;
}
if ( ! needsMove ) continue;
// move it. m_inDumpLoop should still
// be true so we will get added to
// m_ptr2
int32_t size;
if ( tree->m_sizes ) size = tree->m_sizes[i];
else size = tree->m_fixedDataSize;
if ( size < 0 ) { g_process.shutdownAbort(true); }
if ( size == 0 ) continue;
// m_inDumpLoop is still true at this point so
// so allocData should return m_ptr2 guys
char *newData = (char *)allocData(NULL,size,0);
if ( ! newData ) {
log("rdbmem: failed to alloc %i "
"bytes node %i",(int)size,(int)i);
continue;
}
// debug test
bool stillNeedsMove = false;
if ( m_ptr1 < m_ptr2 ) {
// and the node data is in it...
if ( newData < m_ptr1 )
stillNeedsMove = true;
}
// growing downward otherwise
else if ( newData >= m_ptr1 ) {
stillNeedsMove = true;
}
if ( stillNeedsMove ) {// this should never happen!!
log("rdbmem: olddata=0x%" PTRFMT" newdata=0x%" PTRFMT,
(PTRTYPE)data, (PTRTYPE)newData);
log("rdbmem: still needs move!");
}
count++;
gbmemcpy(newData,data,size);
tree->m_data[i] = newData;
}
if ( count > 0 )
log("rdbmem: moved %i tree nodes for %s",(int)count,
m_rdb->m_dbname);
log("rdbmem: stop freeing dumped mem. scanned %i nodes.",(int)scanned);
// save primary ptr
char *tmp = m_ptr1;
// debug
//logf(LOG_DEBUG,
// "db: freeing dumped mem ptr1=%" PRIx32" ptr2=%" PRIx32".",m_ptr1,m_ptr2);
// primary pointer, m_ptr1, becomes m_ptr2
m_ptr1 = m_ptr2;
//.........这里部分代码省略.........
示例15: memset
// returns -1 and sets g_errno on error, because 0 means langUnknown
long Words::getLanguage( Sections *sections ,
long maxSamples,
long niceness,
long *langScore) {
// calculate scores if not given
//Scores calcdScores;
//if ( ! scores ) {
// if ( ! calcdScores.set( this,m_version,false ) )
// return -1;
// scores = &calcdScores;
//}
// . take a random sample of words and look them up in the
// language dictionary
//HashTableT<long long, char> ht;
HashTableX ht;
long long langCount[MAX_LANGUAGES];
long long langWorkArea[MAX_LANGUAGES];
long numWords = m_numWords;
//long skip = numWords/maxSamples;
//if ( skip == 0 ) skip = 1;
// reset the language count
memset(langCount, 0, sizeof(long long)*MAX_LANGUAGES);
// sample the words
//long wordBase = 0;
long wordi = 0;
//if ( ! ht.set(maxSamples*1.5) ) return -1;
if ( ! ht.set(8,1,(long)(maxSamples*8.0),NULL,0,false,
niceness,"wordslang"))
return -1;
// . avoid words in these bad sections
// . google seems to index SEC_MARQUEE so i took that out of badFlags
long badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
// shortcuts
long long *wids = m_wordIds;
long *wlens = m_wordLens;
char **wptrs = m_words;
//long langTotal = 0;
// log ( LOG_WARN, "xmldoc: Picking language from %li words with %li skip",
// numWords, skip );
char numOne = 1;
Section **sp = NULL;
if ( sections ) sp = sections->m_sectionPtrs;
// this means null too
if ( sections && sections->m_numSections == 0 ) sp = NULL;
long maxCount = 1000;
while ( wordi < numWords ) {
// breathe
QUICKPOLL( niceness );
// move to the next valid word
if ( ! wids [wordi] ) { wordi++; continue; }
if ( wlens[wordi] < 2 ) { wordi++; continue; }
// skip if in a bad section
//long flags = sections->m_sectionPtrs[i]->m_flags;
// meaning script section ,etc
if ( sp && ( sp[wordi]->m_flags & badFlags ) ) {
wordi++; continue; }
// check the language
//unsigned char lang = 0;
// Skip if word is capitalized and not preceded by a tag
//if(s_isWordCap(getWord(wordi), getWordLen(wordi)) &&
// wordi > 0 && !getTagId(wordi - 1)) {
// wordi++;
// continue;
//}
// Skip word if bounded by '/' or '?' might be in a URL
if(isBounded(wordi)) {
wordi++;
continue;
}
// is it arabic? sometimes they are spammy pages and repeat
// a few arabic words over and over again, so don't do deduping
// with "ht" before checking this.
char cl = getCharacterLanguage ( wptrs[wordi] );
if ( cl ) {
langCount[(unsigned char)cl]++;
wordi++;
continue;
}
//if(ht.getSlot(m_wordIds[wordi]) !=-1) {
if(!ht.isEmpty(&m_wordIds[wordi]) ) {
wordi++;
continue;
}
// If we can't add the word, it's not that bad.
// Just gripe about it in the log.
if(!ht.addKey(&m_wordIds[wordi], &numOne)) {
log(LOG_WARN, "build: Could not add word to temporary "
"table, memory error?\n");
g_errno = ENOMEM;
//.........这里部分代码省略.........