This article collects typical usage examples of the C++ method SpiderRequest::setKey. If you are unsure how SpiderRequest::setKey is used in practice, what it does, or what real calls look like, the hand-picked code examples here may help. You can also explore further usage examples of the enclosing class, SpiderRequest.
Six code examples of the SpiderRequest::setKey method are shown below, sorted by popularity by default.
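Before the individual listings, here is a minimal sketch (not one of the six examples below) of the calling pattern they all share: the url is copied into SpiderRequest::m_url, a fake "first IP" is derived from a 32-bit hash of the url, and setKey() is then called with that IP, a parent docid of 0, and false for the final flag. The header names and the helper function are assumptions for illustration only; the SpiderRequest usage itself mirrors the examples.

#include <string.h>
#include "Spider.h"   // SpiderRequest (assumed header name)
#include "hash.h"     // hash32n()     (assumed header name)

// Illustrative helper, not part of the original code: build a SpiderRequest
// keyed on a fake firstIp, the way every example below does.
static void makeFakeSpiderRequest ( SpiderRequest *sreq , char *url ) {
	// start from a clean request
	sreq->reset();
	// the url must be stored before the key is made
	// (see "make the key dude -- after setting url" in Example 6)
	strcpy ( sreq->m_url , url );
	// fabricate a first IP from the url hash; 0 and -1 are treated as
	// invalid in every example, so bump them to 1
	long firstIp = hash32n ( url );
	if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
	sreq->m_fakeFirstIp = 1;
	sreq->m_firstIp     = firstIp;
	// setKey ( firstIp , parentDocId , isDel ): parent docid of 0, not a delete
	sreq->setKey ( firstIp , 0LL , false );
}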
Example 1: sendPageAnalyze
//......... part of the code omitted here .........
long isXml = r->getLong("xml",0);
// if got docid, use that
if ( st->m_docId != -1 ) {
if ( ! xd->set3 ( st->m_docId,
st->m_coll,
0 ) ) // niceness
// return error reply if g_errno is set
return sendErrorReply ( st , g_errno );
// make this our callback in case something blocks
xd->setCallback ( st , gotXmlDoc );
xd->m_pbuf = &st->m_wbuf;
// reset this flag
st->m_donePrinting = false;
// . set xd from the old title rec if recycle is true
// . can also use XmlDoc::m_loadFromOldTitleRec flag
//if ( st->m_recycle ) xd->m_recycleContent = true;
xd->m_recycleContent = true;
// force this on
//xd->m_useSiteLinkBuf = true;
//xd->m_usePageLinkBuf = true;
if ( isXml ) xd->m_printInXml = true;
// now tell it to fetch the old title rec
if ( ! xd->loadFromOldTitleRec () )
// return false if this blocks
return false;
return gotXmlDoc ( st );
}
// set this up
SpiderRequest sreq;
sreq.reset();
if ( st->m_u ) strcpy(sreq.m_url,st->m_u);
long firstIp = hash32n(st->m_u);
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
// parentdocid of 0
sreq.setKey( firstIp, 0LL, false );
sreq.m_isPageParser = 1;
sreq.m_hopCount = st->m_hopCount;
sreq.m_hopCountValid = 1;
sreq.m_fakeFirstIp = 1;
sreq.m_firstIp = firstIp;
Url nu;
nu.set(sreq.m_url);
sreq.m_domHash32 = nu.getDomainHash32();
sreq.m_siteHash32 = nu.getHostHash32();
// . get provided content if any
// . will be NULL if none provided
// . "content" may contain a MIME
long contentLen = 0;
char *content = r->getString ( "content" , &contentLen , NULL );
// is the "content" url-encoded? default is true.
bool contentIsEncoded = true;
// mark doesn't like to url-encode his content
if ( ! content ) {
content = r->getUnencodedContent ();
contentLen = r->getUnencodedContentLen ();
contentIsEncoded = false;
}
// ensure null
if ( contentLen == 0 ) content = NULL;
//uint8_t contentType = CT_HTML;
//if ( isXml ) contentType = CT_XML;
long ctype = r->getLong("ctype",CT_HTML);
// . use the enormous power of our new XmlDoc class
// . this returns false if blocked
if ( ! xd->set4 ( &sreq ,
NULL ,
st->m_coll ,
// we need this so the term table is set!
&st->m_wbuf , // XmlDoc::m_pbuf
0, // try 0 now! 1 ,//PP_NICENESS ))
content ,
false, // deletefromindex
0, // forced ip
ctype ))
// return error reply if g_errno is set
return sendErrorReply ( st , g_errno );
// make this our callback in case something blocks
xd->setCallback ( st , gotXmlDoc );
// reset this flag
st->m_donePrinting = false;
// prevent a core here in the event we download the page content
xd->m_crawlDelayValid = true;
xd->m_crawlDelay = 0;
// . set xd from the old title rec if recycle is true
// . can also use XmlDoc::m_loadFromOldTitleRec flag
//if ( st->m_recycle ) xd->m_recycleContent = true;
// only recycle if docid is given!!
if ( st->m_recycle ) xd->m_recycleContent = true;
// force this on
//xd->m_useSiteLinkBuf = true;
//xd->m_usePageLinkBuf = true;
if ( isXml ) xd->m_printInXml = true;
return gotXmlDoc ( st );
}
Example 2: sendPageParser2
//......... part of the code omitted here .........
"</textarea>"
"</td>"
"</tr>"
"</table>"
"</center>"
"</form>"
"<br>",
//oips ,
contentParm );
xbuf->safePrintf(
"<center>"
"<input type=submit value=Submit>"
"</center>"
);
// just print the page if no url given
if ( ! st->m_u || ! st->m_u[0] ) return processLoop ( st );
XmlDoc *xd = &st->m_xd;
// set this up
SpiderRequest sreq;
sreq.reset();
strcpy(sreq.m_url,st->m_u);
long firstIp = hash32n(st->m_u);
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
// parentdocid of 0
sreq.setKey( firstIp, 0LL, false );
sreq.m_isPageParser = 1;
sreq.m_hopCount = st->m_hopCount;
sreq.m_hopCountValid = 1;
sreq.m_fakeFirstIp = 1;
sreq.m_firstIp = firstIp;
Url nu;
nu.set(sreq.m_url);
sreq.m_domHash32 = nu.getDomainHash32();
sreq.m_siteHash32 = nu.getHostHash32();
// . get provided content if any
// . will be NULL if none provided
// . "content" may contain a MIME
long contentLen = 0;
char *content = r->getString ( "content" , &contentLen , NULL );
// is the "content" url-encoded? default is true.
bool contentIsEncoded = true;
// mark doesn't like to url-encode his content
if ( ! content ) {
content = r->getUnencodedContent ();
contentLen = r->getUnencodedContentLen ();
contentIsEncoded = false;
}
// ensure null
if ( contentLen == 0 ) content = NULL;
uint8_t contentType = CT_HTML;
if ( r->getBool("xml",0) ) contentType = CT_XML;
contentType = r->getLong("ctype",contentType);//CT_HTML);
Example 3: scrapeQuery
// . "uf" is printf url format to scrape with a %s for the query
// . example: uf="http://www.google.com/search?num=50&q=%s&scoring=d&filter=0";
bool Msg7::scrapeQuery ( ) {
// advance round now in case we return early
m_round++;
// error?
if ( m_qbuf.length() > 500 ) {
g_errno = EQUERYTOOBIG;
return true;
}
// first encode the query
SafeBuf ebuf;
ebuf.urlEncode ( m_qbuf.getBufStart() ); // queryUNEncoded );
char *uf;
if ( m_round == 1 )
// set to 1 for debugging
uf="http://www.google.com/search?num=20&"
"q=%s&scoring=d&filter=0";
//uf = "https://startpage.com/do/search?q=%s";
//uf = "http://www.google.com/"
// "/cse?cx=013269018370076798483%3A8eec3papwpi&"
// "ie=UTF-8&q=%s&"
// "num=20";
else
uf="http://www.bing.com/search?q=%s";
// skip bing for now
//if ( m_round == 2 )
// return true;
//if ( m_round == 1 )
// return true;
// make the url we will download
char ubuf[2048];
sprintf ( ubuf , uf , ebuf.getBufStart() );
// log it
log("inject: SCRAPING %s",ubuf);
SpiderRequest sreq;
sreq.reset();
// set the SpiderRequest
strcpy(sreq.m_url, ubuf);
// . tell it to only add the hosts of each outlink for now!
// . that will be passed on to when XmlDoc calls Links::set() i guess
// . xd will not reschedule the scraped url into spiderdb either
sreq.m_isScraping = 1;
sreq.m_fakeFirstIp = 1;
long firstIp = hash32n(ubuf);
if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
sreq.m_firstIp = firstIp;
// parent docid is 0
sreq.setKey(firstIp,0LL,false);
// forceDel = false, niceness = 0
m_xd.set4 ( &sreq , NULL , m_coll , NULL , 0 );
//m_xd.m_isScraping = true;
// download without throttling
//m_xd.m_throttleDownload = false;
// disregard this
m_xd.m_useRobotsTxt = false;
// this will tell it to index ahrefs first before indexing
// the doc. but do NOT do this if we are from ahrefs.com
// ourselves to avoid recursive explosion!!
if ( m_useAhrefs )
m_xd.m_useAhrefs = true;
m_xd.m_reallyInjectLinks = m_injectLinks;
//
// rather than just add the links of the page to spiderdb,
// let's inject them!
//
m_xd.setCallback ( this , doneInjectingLinksWrapper );
// niceness is 0
m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab2");
// do we actually inject the links, or just scrape?
if ( ! m_xd.injectLinks ( &m_linkDedupTable ,
NULL,
this ,
doneInjectingLinksWrapper ) )
return false;
// otherwise, just download the google/bing search results so we
// can display them in xml
//else if ( m_xd.getUtf8Content() == (char **)-1 )
// return false;
// print reply..
//printReply();
return true;
//......... part of the code omitted here .........
Example 4: inject
bool Msg7::inject ( char *url ,
long forcedIp ,
char *content ,
long contentLen ,
bool recycleContent,
uint8_t contentType,
char *coll ,
bool quickReply ,
char *username ,
char *pwd ,
long niceness,
void *state ,
void (*callback)(void *state),
long firstIndexed,
long lastSpidered,
long hopCount,
char newOnly,
short charset,
char spiderLinks,
char deleteIt,
char hasMime,
bool doConsistencyTesting
) {
m_quickReply = quickReply;
// store coll
if ( ! coll ) { g_errno = ENOCOLLREC; return true; }
long collLen = gbstrlen ( coll );
if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
strncpy ( m_coll , coll , collLen );
m_coll [ collLen ] = '\0';
// store user
//long ulen = 0;
//if ( username ) ulen = gbstrlen(username);
//if ( ulen >= MAX_USER_SIZE-1 ) {g_errno = EBUFOVERFLOW; return true;}
//if ( username ) strcpy( m_username, username );
// store password
//long pwdLen = 0;
//if ( pwd ) pwdLen = gbstrlen(pwd);
//m_pwd [ 0 ] ='\0';
//if ( pwdLen > 31 ) pwdLen = 31;
//if ( pwdLen > 0 ) strncpy ( m_pwd , pwd , pwdLen );
//m_pwd [ pwdLen ] = '\0';
// store url
if ( ! url ) { g_errno = 0; return true; }
long urlLen = gbstrlen(url);
if ( urlLen > MAX_URL_LEN ) {g_errno = EBADENGINEER; return true; }
// skip injecting if no url given! just print the admin page.
if ( urlLen <= 0 ) return true;
//strcpy ( m_url , url );
if ( g_repairMode ) { g_errno = EREPAIRING; return true; }
// send template reply if no content supplied
if ( ! content && ! recycleContent ) {
log("inject: no content supplied to inject command and "
"recycleContent is false.");
//return true;
}
// clean url?
// normalize and add www. if it needs it
Url uu;
uu.set ( url , gbstrlen(url) , true );
// remove >'s i guess and store in st1->m_url[] buffer
char cleanUrl[MAX_URL_LEN+1];
urlLen = cleanInput ( cleanUrl,
MAX_URL_LEN,
uu.getUrl(),
uu.getUrlLen() );
// this can go on the stack since set4() copies it
SpiderRequest sreq;
sreq.reset();
strcpy(sreq.m_url, cleanUrl );
// parentdocid of 0
long firstIp = hash32n(cleanUrl);
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
sreq.setKey( firstIp,0LL, false );
sreq.m_isInjecting = 1;
sreq.m_isPageInject = 1;
sreq.m_hopCount = hopCount;
sreq.m_hopCountValid = 1;
sreq.m_fakeFirstIp = 1;
sreq.m_firstIp = firstIp;
// shortcut
XmlDoc *xd = &m_xd;
// log it now
//log("inject: injecting doc %s",cleanUrl);
static char s_dummy[3];
// sometimes the content is indeed NULL...
if ( newOnly && ! content ) {
//......... part of the code omitted here .........
Example 5: gotPhrase
void Scraper::gotPhrase ( ) {
// error getting random phrase? bail!
if ( g_errno ) log("scraper: got error getting random phrase: %s",
mstrerror(g_errno));
CollectionRec *cr = g_collectiondb.getRec ( m_coll );
loop:
// what type of query should we do?
m_qtype = rand() % 3;
// make sure web, news, blog is enabled
if ( m_qtype == 0 && ! cr->m_scrapingEnabledWeb ) goto loop;
if ( m_qtype == 1 && ! cr->m_scrapingEnabledNews ) goto loop;
if ( m_qtype == 2 && ! cr->m_scrapingEnabledBlogs ) goto loop;
// scraping is off when repairing obviously
if ( g_repairMode ) return;
// get it
char *s = g_wiki.m_randPhrase;
// convert _'s to spaces
for ( char *p = s ; *p ; p++ )
if ( *p == '_' ) *p = ' ';
// . url encode the random phrase
// . truncate it to 200 bytes to keep things sane
// . Wiki::doneReadingWiki() keeps it below 128 i think anyway
char qe[400];
urlEncode(qe, 200, s , gbstrlen(s) );
char *end = qe + 390;
// half the time append a random word from dictionary so that we
// discover those tail-end sites better
if ( m_qtype == 0 && (rand() % 2) ) {
// point into it for appending
char *p = qe + gbstrlen(qe);
// add a space, url encoded
*p++ = '+';
// append a random word to it from dictionary
char *rw = g_speller.getRandomWord();
// append that in
urlEncode( p , end - p - 1 , rw , gbstrlen(rw) );
}
// make a query to scrape
char buf[2048];
char *uf ;
if ( m_qtype == 0 )
uf="http://www.google.com/search?num=50&q=%s&scoring=d"
"&filter=0";
// google news query? sort by date.
else if ( m_qtype == 1 )
uf="http://news.google.com/news?num=50&q=%s&sort=n"
"&filter=0";
// google blog query?
else if ( m_qtype == 2 )
uf="http://www.google.com/blogsearch?num=50&q=%s&scoring=d"
"&filter=0";
// sanity check
else { char *xx=NULL;*xx=0; }
// make the url we will download
sprintf ( buf , uf , qe );
SpiderRequest sreq;
// set the SpiderRequest from the formatted url in buf (not the format string uf)
strcpy(sreq.m_url, buf);
// . tell it to only add the hosts of each outlink for now!
// . that will be passed on to when XmlDoc calls Links::set() i guess
// . xd will not reschedule the scraped url into spiderdb either
sreq.m_isScraping = 1;
sreq.m_fakeFirstIp = 1;
long firstIp = hash32n(buf);
if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
sreq.m_firstIp = firstIp;
// parent docid is 0
sreq.setKey(firstIp,0LL,false);
// forceDel = false, niceness = 0
m_xd.set4 ( &sreq , NULL , m_coll , NULL , 0 );
//m_xd.m_isScraping = true;
// download without throttling
//m_xd.m_throttleDownload = false;
// disregard this
m_xd.m_useRobotsTxt = false;
// call this when index completes
m_xd.setCallback ( NULL , indexedDocWrapper );
// assume it blocked
m_numSent++;
// scraper is special
m_xd.m_usePosdb = false;
m_xd.m_useDatedb = false;
m_xd.m_useClusterdb = false;
//......... part of the code omitted here .........
Example 6: sendPageAddUrl
//......... part of the code omitted here .........
// . should we force it into spiderdb even if already in there
// . use to manually update spider times for a url
// . however, will not remove old scheduled spider times
// . mdw: made force on the default
st1->m_forceRespider = r->getLong("force",1); // 0);
long now = getTimeGlobal();
// . allow 1 submit every 1 hour
// . restrict by submitter domain ip
if ( ! st1->m_isAdmin &&
! canSubmit ( h , now , cr->m_maxAddUrlsPerIpDomPerDay ) ) {
// return error page
g_errno = ETOOEARLY;
return sendReply ( st1 , true );
}
//st1->m_query = r->getString( "qts", &st1->m_queryLen );
// check it, if turing test is enabled for this collection
if ( ! st1->m_isAdmin && cr->m_doTuringTest &&
! g_turingTest.isHuman(r) ) {
// log note so we know it didn't make it
g_msg = " (error: bad answer)";
//log("PageAddUrl:: addurl failed for %s : bad answer",
// iptoa(s->m_ip));
st1->m_goodAnswer = false;
return sendReply ( st1 , true /*addUrl enabled?*/ );
}
//if ( st1->m_queryLen > 0 )
// return getPages( st1 );
// if no url given, just print a blank page
if ( ! url ) return sendReply ( st1 , true );
//
// make a SpiderRequest
//
SpiderRequest *sreq = &st1->m_sreq;
// reset it
sreq->reset();
// make the probable docid
long long probDocId = g_titledb.getProbableDocId ( st1->m_url );
// make one up, like we do in PageReindex.cpp
long firstIp = (probDocId & 0xffffffff);
// . now fill it up
// . TODO: calculate the other values... lazy!!! (m_isRSSExt,
// m_siteNumInlinks,...)
sreq->m_isNewOutlink = 1;
sreq->m_isAddUrl = 1;
sreq->m_addedTime = now;
sreq->m_fakeFirstIp = 1;
sreq->m_probDocId = probDocId;
sreq->m_firstIp = firstIp;
sreq->m_hopCount = 0;
// its valid if root
Url uu; uu.set ( st1->m_url );
if ( uu.isRoot() ) sreq->m_hopCountValid = true;
// too big?
//long len = st1->m_urlLen;
// the url! includes \0
strcpy ( sreq->m_url , st1->m_url );
// call this to set sreq->m_dataSize now
sreq->setDataSize();
// make the key dude -- after setting url
sreq->setKey ( firstIp , 0LL, false );
// need a fake first ip lest we core!
//sreq->m_firstIp = (pdocId & 0xffffffff);
// how to set m_firstIp? i guess addurl can be throttled independently
// of the other urls??? use the hash of the domain for it!
long dlen;
char *dom = getDomFast ( st1->m_url , &dlen );
// fake it for this...
//sreq->m_firstIp = hash32 ( dom , dlen );
// sanity
if ( ! dom ) {
g_errno = EBADURL;
return sendReply ( st1 , true );
}
// shortcut
Msg4 *m = &st1->m_msg4;
// now add that to spiderdb using msg4
if ( ! m->addMetaList ( (char *)sreq ,
sreq->getRecSize() ,
coll ,
st1 , // state
addedStuff ,
MAX_NICENESS ,
RDB_SPIDERDB ) )
// we blocked
return false;
// send back the reply
return sendReply ( st1 , true );
}