本文整理汇总了C++中SpiderRequest类的典型用法代码示例。如果您正苦于以下问题：C++ SpiderRequest类的具体用法？C++ SpiderRequest怎么用？C++ SpiderRequest使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了SpiderRequest类的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C++代码示例。
示例1: sendPageAnalyze
//.........这里部分代码省略.........
// Fragment of sendPageAnalyze; the code before and after this span is not
// shown, so "st" and "r" are set up by code outside it. Two paths are visible:
// . a docid was supplied: point the XmlDoc at it, ask it to load the old
//   title rec and continue in gotXmlDoc()
// . otherwise: build a fake SpiderRequest for the url in st->m_u and pick up
//   any page content that came with the request
XmlDoc *xd = &st->m_xd;
// "xml" cgi parm, defaults to 0; non-zero turns on xd->m_printInXml below
long isXml = r->getLong("xml",0);
// if got docid, use that
if ( st->m_docId != -1 ) {
if ( ! xd->set3 ( st->m_docId,
st->m_coll,
0 ) ) // niceness
// return error reply if g_errno is set
return sendErrorReply ( st , g_errno );
// make this our callback in case something blocks
xd->setCallback ( st , gotXmlDoc );
// hand the XmlDoc a pointer to the state's m_wbuf buffer
xd->m_pbuf = &st->m_wbuf;
// reset this flag
st->m_donePrinting = false;
// . set xd from the old title rec if recycle is true
// . can also use XmlDoc::m_loadFromOldTitleRec flag
// . NOTE: recycling is forced on here; the st->m_recycle test is disabled
//if ( st->m_recycle ) xd->m_recycleContent = true;
xd->m_recycleContent = true;
// force this on
//xd->m_useSiteLinkBuf = true;
//xd->m_usePageLinkBuf = true;
if ( isXml ) xd->m_printInXml = true;
// now tell it to fetch the old title rec
if ( ! xd->loadFromOldTitleRec () )
// return false if this blocks
return false;
// did not block, so continue right away
return gotXmlDoc ( st );
}
// no docid given: set up a SpiderRequest for the url instead
SpiderRequest sreq;
sreq.reset();
// NOTE(review): st->m_u is NULL-checked here but handed unchecked to
// hash32n() on the next line -- confirm it cannot be NULL at this point
if ( st->m_u ) strcpy(sreq.m_url,st->m_u);
// the first ip is faked from a hash of the url (see m_fakeFirstIp below)
long firstIp = hash32n(st->m_u);
// 0 and -1 are replaced with 1
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
// parentdocid of 0
sreq.setKey( firstIp, 0LL, false );
sreq.m_isPageParser = 1;
sreq.m_hopCount = st->m_hopCount;
sreq.m_hopCountValid = 1;
sreq.m_fakeFirstIp = 1;
sreq.m_firstIp = firstIp;
// derive the domain and host hashes from the url we just stored
Url nu;
nu.set(sreq.m_url);
sreq.m_domHash32 = nu.getDomainHash32();
sreq.m_siteHash32 = nu.getHostHash32();
// . get provided content if any
// . will be NULL if none provided
// . "content" may contain a MIME
long contentLen = 0;
char *content = r->getString ( "content" , &contentLen , NULL );
// is the "content" url-encoded? default is true.
bool contentIsEncoded = true;
// mark doesn't like to url-encode his content
// . no "content" parm: fall back to the request's unencoded content
if ( ! content ) {
content = r->getUnencodedContent ();
contentLen = r->getUnencodedContentLen ();
contentIsEncoded = false;
}
// ensure null
if ( contentLen == 0 ) content = NULL;
示例2: sendPageGet
//.........这里部分代码省略.........
// Tail of sendPageGet; the start of the function is not shown, so "r", "s",
// "docId", "coll", "url", "q" and "qlen" are set by code outside this span.
// . allocates a State2 for the request and fills it from the cgi parms in "r"
// . points st->m_xd at the url (if one was given) or else at the docid
// . returns whatever processLoop() returns, or the result of sending a 500
//   error reply if the state cannot be allocated or the XmlDoc cannot be set
// restrict to root file?
bool rtq = r->getLong ( "rtq" , false );
// . get the titleRec
// . TODO: redirect client to a better http server to save bandwidth
State2 *st ;
try { st = new (State2); }
catch ( ... ) {
	g_errno = ENOMEM;
	log("PageGet: new(%i): %s",
	    (int)sizeof(State2),mstrerror(g_errno));
	return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
}
mnew ( st , sizeof(State2) , "PageGet1" );
// save the socket and if Host: is local in the Http request Mime
st->m_socket  = s;
st->m_isAdmin = g_conf.isCollAdmin ( s , r );
st->m_isLocal = r->isLocal();
st->m_docId   = docId;
st->m_printed = false;
// include header ... "this page cached by Gigablast on..."
st->m_includeHeader     = r->getLong ("ih"    , true  );
st->m_includeBaseHref   = r->getLong ("ibh"   , false );
st->m_queryHighlighting = r->getLong ("qh"    , true  );
st->m_strip             = r->getLong ("strip" , 0     );
st->m_clickAndScroll    = r->getLong ("cas"   , true  );
st->m_cnsPage           = r->getLong ("cnsp"  , true  );
// query language, if the "qlang" parm was given
char *langAbbr = r->getString("qlang",NULL);
st->m_langId = langUnknown;
if ( langAbbr ) {
	uint8_t langId = getLangIdFromAbbr ( langAbbr );
	st->m_langId = langId;
}
// . copy the collection name and always terminate it
// . strncpy() does not write a \0 when "coll" has MAX_COLL_LEN or more
//   bytes, so terminate by hand. this touches the same MAX_COLL_LEN+1
//   bytes of m_coll that the old strncpy(...,MAX_COLL_LEN+1) call did.
strncpy ( st->m_coll , coll , MAX_COLL_LEN );
st->m_coll [ MAX_COLL_LEN ] = '\0';
st->m_netTestResults = r->getLong ("rnettest", false );
// store query for query highlighting
// NOTE(review): unbounded copy -- the size of State2::m_q is not visible
// here; confirm "q" is length-limited by the caller
if ( q && qlen > 0 ) strcpy ( st->m_q , q );
else                 st->m_q[0] = '\0';
st->m_qlen      = qlen;
st->m_rtq       = rtq;
st->m_boolFlag  = r->getLong ("bq", 2 /*default is 2*/ );
st->m_isBanned  = false;
st->m_noArchive = false;
st->m_format    = r->getReplyFormat();
// default to 0 niceness
st->m_niceness = 0;
st->m_r.copy ( r );
// the disclaimer is dropped for "cnsp" pages and for stripped pages
st->m_printDisclaimer = true;
if ( st->m_cnsPage )
	st->m_printDisclaimer = false;
if ( st->m_strip )
	st->m_printDisclaimer = false;
// should we cache it?
// NOTE(review): none of these four values is read again before the end
// of the function; kept so the request is parsed exactly as before
char useCache = r->getLong ( "usecache" , 1 );
char rcache   = r->getLong ( "rcache"   , 1 );
char wcache   = r->getLong ( "wcache"   , 1 );
long cacheAge = r->getLong ( "cacheAge" , 60*60 ); // default one hour
if ( useCache == 0 ) { cacheAge = 0; wcache = 0; }
if ( rcache   == 0 ) cacheAge = 0;
// . fetch the TitleRec
// . a max cache age of 0 means not to read from the cache
XmlDoc *xd = &st->m_xd;
// url based?
if ( url ) {
	SpiderRequest sreq;
	sreq.reset();
	strcpy(sreq.m_url, url );
	sreq.setDataSize();
	// this returns false if "coll" is invalid
	if ( ! xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness ) )
		goto hadSetError;
}
// . when getTitleRec() is called it will load the old one
//   since XmlDoc::m_setFromTitleRec will be true
// . niceness is 0
// . use st->m_coll since XmlDoc just points to it!
// . this returns false if "coll" is invalid
else if ( ! xd->set3 ( docId , st->m_coll , 0 ) ) {
 hadSetError:
	// . remember why set3()/set4() failed before the state is torn down
	// . this path used to overwrite g_errno with ENOMEM unconditionally,
	//   which hid the real cause (e.g. an invalid collection)
	// . ENOMEM is still reported if the setter left g_errno clear
	int setErr = g_errno ? g_errno : ENOMEM;
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete ( st );
	g_errno = setErr;
	log("PageGet: set3: %s", mstrerror(g_errno));
	return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
}
// if it blocks while it loads title rec, it will re-call this routine
xd->setCallback ( st , processLoopWrapper );
// good to go!
return processLoop ( st );
}
示例3: sendPageParser2
//.........这里部分代码省略.........
"rather than downloading it from the web."
"</td>"
"<td>"
"<textarea rows=10 cols=80 name=content>"
"%s"
"</textarea>"
"</td>"
"</tr>"
"</table>"
"</center>"
"</form>"
"<br>",
//oips ,
contentParm );
// Fragment of sendPageParser2; the code before and after this span is not
// shown, so "xbuf", "st" and "r" are set up by code outside it.
// . prints the centered submit button into xbuf
// . if no url was given it just goes to processLoop()
// . otherwise builds a fake SpiderRequest for st->m_u and picks up any page
//   content that came with the request
xbuf->safePrintf(
"<center>"
"<input type=submit value=Submit>"
"</center>"
);
// just print the page if no url given
if ( ! st->m_u || ! st->m_u[0] ) return processLoop ( st );
XmlDoc *xd = &st->m_xd;
// set this up
SpiderRequest sreq;
sreq.reset();
// st->m_u is known to be non-empty here because of the check above
strcpy(sreq.m_url,st->m_u);
// the first ip is faked from a hash of the url (see m_fakeFirstIp below)
long firstIp = hash32n(st->m_u);
// 0 and -1 are replaced with 1
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
// parentdocid of 0
sreq.setKey( firstIp, 0LL, false );
sreq.m_isPageParser = 1;
sreq.m_hopCount = st->m_hopCount;
sreq.m_hopCountValid = 1;
sreq.m_fakeFirstIp = 1;
sreq.m_firstIp = firstIp;
// derive the domain and host hashes from the url we just stored
Url nu;
nu.set(sreq.m_url);
sreq.m_domHash32 = nu.getDomainHash32();
sreq.m_siteHash32 = nu.getHostHash32();
// . get provided content if any
// . will be NULL if none provided
// . "content" may contain a MIME
long contentLen = 0;
char *content = r->getString ( "content" , &contentLen , NULL );
// is the "content" url-encoded? default is true.
bool contentIsEncoded = true;
// mark doesn't like to url-encode his content
// . no "content" parm: fall back to the request's unencoded content
if ( ! content ) {
content = r->getUnencodedContent ();
contentLen = r->getUnencodedContentLen ();
contentIsEncoded = false;
}
// ensure null
if ( contentLen == 0 ) content = NULL;
示例4: Msg1c::gotList
// . this returns false if blocks, true otherwise
// . sets g_errno on failure
// . called once m_msg3a holds the docids matching the reindex query
// . for each distinct docid (from m_startNum on) it fills in a SpiderRequest
//   whose "url" is the docid printed as a decimal number
// . the rest of the function is cut off below, so what is done with each
//   request after it is filled in is not visible here
bool Msg1c::gotList ( ) {
// nothing to do if an error is already pending
if ( g_errno ) return true;
int64_t *tmpDocIds = m_msg3a.getDocIds();
int32_t numDocIds = m_msg3a.getNumDocIds();
// skip the first m_startNum docids if asked to
if ( m_startNum > 0) {
numDocIds -= m_startNum;
tmpDocIds = &tmpDocIds[m_startNum];
}
m_numDocIds = numDocIds; // save for reporting
// log it
log(LOG_INFO,"admin: Got %" PRId32" docIds for query reindex.", numDocIds);
// bail if no need
// . this also catches a negative count when m_startNum exceeds the total
if ( numDocIds <= 0 ) return true;
// force spiders on on entire network. they will propagate from
// host #0...
g_conf.m_spideringEnabled = true;
int32_t nowGlobal = getTimeGlobal();
// . table of docids already handled, used to skip duplicates below
// . presumably 8-byte keys with no data, backed by the 1024-byte stack
//   buffer "dbuf" -- TODO confirm against HashTableX::set()
HashTableX dt;
char dbuf[1024];
dt.set(8,0,64,dbuf,1024,false,0,"ddocids");
m_sb.setLabel("reiadd");
State13 *st = (State13 *)m_state;
GigablastRequest *gr = &st->m_gr;
m_numDocIdsAdded = 0;
// list consists of docIds, loop through each one
for(int32_t i = 0; i < numDocIds; i++) {
int64_t docId = tmpDocIds[i];
// when searching events we get multiple docids that are same
if ( dt.isInTable ( &docId ) ) continue;
// add it
// . gives up (returning true) if the key cannot be added
if ( ! dt.addKey ( &docId ) ) return true;
SpiderRequest sr;
sr.reset();
// url is a docid!
sprintf ( sr.m_url , "%" PRIu64 , docId );
// make a fake first ip
// use only 64k values so we don't stress doledb/waittrees/etc.
// for large #'s of docids
int32_t firstIp = (docId & 0x0000ffff);
// bits 6-13 of the docid are the domain hash so use those
// when doing a REINDEX (not delete!) to ensure that requests
// on the same domain go to the same shard, at least when
// we have up to 256 shards. if we have more than 256 shards
// at this point some shards will not participate in the
// query reindex/delete process because of this, so
// we'll want to allow more bits in that case perhaps.
// check out Hostdb::getShardNum(RDB_SPIDERDB) in Hostdb.cpp
// to see what shard is responsible for storing and indexing
// this SpiderRequest based on the firstIp.
if ( ! m_forceDel ) {
// if we are a REINDEX not a delete because
// deletes don't need to spider/redownload the doc
// so the distribution can be more random
firstIp >>= 6;
firstIp &= 0xff;
}
// 0 is not a legit val. it'll core below.
if ( firstIp == 0 ) {
firstIp = 1;
}
// use a fake ip
sr.m_firstIp = firstIp;
// we are not really injecting...
sr.m_isInjecting = false;//true;
sr.m_hopCount = -1;
sr.m_isPageReindex = 1;
sr.m_urlIsDocId = 1;
sr.m_fakeFirstIp = 1;
// now you can recycle content instead of re-downloading it
// for every docid
sr.m_recycleContent = gr->m_recycleContent;
// if this is zero we end up getting deduped in
// dedupSpiderList() if there was a SpiderReply whose
// spider time was > 0
sr.m_addedTime = nowGlobal;
sr.m_forceDelete = m_forceDel ? 1 : 0;
// . complete its m_key member
// . parentDocId is used to make the key, but only allow one
// page reindex spider request per url... so use "0"
//.........这里部分代码省略.........
示例5: Msg7::scrapeQuery
// . "uf" is printf url format to scrape with a %s for the query
// . example: uf="http://www.google.com/search?num=50&q=%s&scoring=d&filter=0";
bool Msg7::scrapeQuery ( ) {
// advance round now in case we return early
m_round++;
// error?
if ( m_qbuf.length() > 500 ) {
g_errno = EQUERYTOOBIG;
return true;
}
// first encode the query
SafeBuf ebuf;
ebuf.urlEncode ( m_qbuf.getBufStart() ); // queryUNEncoded );
char *uf;
if ( m_round == 1 )
// set to 1 for debugging
uf="http://www.google.com/search?num=20&"
"q=%s&scoring=d&filter=0";
//uf = "https://startpage.com/do/search?q=%s";
//uf = "http://www.google.com/"
// "/cse?cx=013269018370076798483%3A8eec3papwpi&"
// "ie=UTF-8&q=%s&"
// "num=20";
else
uf="http://www.bing.com/search?q=%s";
// skip bing for now
//if ( m_round == 2 )
// return true;
//if ( m_round == 1 )
// return true;
// make the url we will download
char ubuf[2048];
sprintf ( ubuf , uf , ebuf.getBufStart() );
// log it
log("inject: SCRAPING %s",ubuf);
SpiderRequest sreq;
sreq.reset();
// set the SpiderRequest
strcpy(sreq.m_url, ubuf);
// . tell it to only add the hosts of each outlink for now!
// . that will be passed on to when XmlDoc calls Links::set() i guess
// . xd will not reschedule the scraped url into spiderdb either
sreq.m_isScraping = 1;
sreq.m_fakeFirstIp = 1;
long firstIp = hash32n(ubuf);
if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
sreq.m_firstIp = firstIp;
// parent docid is 0
sreq.setKey(firstIp,0LL,false);
// forceDEl = false, niceness = 0
m_xd.set4 ( &sreq , NULL , m_coll , NULL , 0 );
//m_xd.m_isScraping = true;
// download without throttling
//m_xd.m_throttleDownload = false;
// disregard this
m_xd.m_useRobotsTxt = false;
// this will tell it to index ahrefs first before indexing
// the doc. but do NOT do this if we are from ahrefs.com
// ourselves to avoid recursive explosion!!
if ( m_useAhrefs )
m_xd.m_useAhrefs = true;
m_xd.m_reallyInjectLinks = m_injectLinks;
//
// rather than just add the links of the page to spiderdb,
// let's inject them!
//
m_xd.setCallback ( this , doneInjectingLinksWrapper );
// niceness is 0
m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab2");
// do we actually inject the links, or just scrape?
if ( ! m_xd.injectLinks ( &m_linkDedupTable ,
NULL,
this ,
doneInjectingLinksWrapper ) )
return false;
// otherwise, just download the google/bing search results so we
// can display them in xml
//else if ( m_xd.getUtf8Content() == (char **)-1 )
// return false;
// print reply..
//printReply();
return true;
//.........这里部分代码省略.........
示例6: Msg7::inject
// Injects a document into collection "coll" under the given url.
// Only the first part of the function is shown; it is cut off below.
// . url: url to inject. NULL or empty means "nothing to inject" and the
//   function returns true with no error. a url longer than MAX_URL_LEN
//   sets g_errno to EBADENGINEER.
// . content/recycleContent: if content is NULL and recycleContent is false
//   a note is logged, but the function carries on
// . coll: collection name, required. NULL sets g_errno to ENOCOLLREC.
//   stored in m_coll, truncated to MAX_COLL_LEN bytes.
// . quickReply: saved in m_quickReply
// . hopCount: stored in the SpiderRequest and marked valid
// . newOnly: tested together with content at the point where the listing
//   is cut off
// . the remaining parameters are not used in the visible portion
// . returns true on every early exit shown here, with g_errno set when the
//   exit is an error (EREPAIRING while g_repairMode is set)
bool Msg7::inject ( char *url ,
long forcedIp ,
char *content ,
long contentLen ,
bool recycleContent,
uint8_t contentType,
char *coll ,
bool quickReply ,
char *username ,
char *pwd ,
long niceness,
void *state ,
void (*callback)(void *state),
long firstIndexed,
long lastSpidered,
long hopCount,
char newOnly,
short charset,
char spiderLinks,
char deleteIt,
char hasMime,
bool doConsistencyTesting
) {
m_quickReply = quickReply;
// store coll
if ( ! coll ) { g_errno = ENOCOLLREC; return true; }
long collLen = gbstrlen ( coll );
// clamp so the copy plus the \0 written below stays within
// MAX_COLL_LEN+1 bytes
if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
strncpy ( m_coll , coll , collLen );
m_coll [ collLen ] = '\0';
// store user
//long ulen = 0;
//if ( username ) ulen = gbstrlen(username);
//if ( ulen >= MAX_USER_SIZE-1 ) {g_errno = EBUFOVERFLOW; return true;}
//if ( username ) strcpy( m_username, username );
// store password
//long pwdLen = 0;
//if ( pwd ) pwdLen = gbstrlen(pwd);
//m_pwd [ 0 ] ='\0';
//if ( pwdLen > 31 ) pwdLen = 31;
//if ( pwdLen > 0 ) strncpy ( m_pwd , pwd , pwdLen );
//m_pwd [ pwdLen ] = '\0';
// store url
// . a missing url is not treated as an error
if ( ! url ) { g_errno = 0; return true; }
long urlLen = gbstrlen(url);
if ( urlLen > MAX_URL_LEN ) {g_errno = EBADENGINEER; return true; }
// skip injecting if no url given! just print the admin page.
if ( urlLen <= 0 ) return true;
//strcpy ( m_url , url );
if ( g_repairMode ) { g_errno = EREPAIRING; return true; }
// send template reply if no content supplied
// NOTE(review): the early return is commented out, so this only logs
if ( ! content && ! recycleContent ) {
log("inject: no content supplied to inject command and "
"recycleContent is false.");
//return true;
}
// clean url?
// normalize and add www. if it needs it
Url uu;
uu.set ( url , gbstrlen(url) , true );
// remove >'s i guess and store in st1->m_url[] buffer
// . the cleaned url goes into the local cleanUrl[] buffer
char cleanUrl[MAX_URL_LEN+1];
urlLen = cleanInput ( cleanUrl,
MAX_URL_LEN,
uu.getUrl(),
uu.getUrlLen() );
// this can go on the stack since set4() copies it
SpiderRequest sreq;
sreq.reset();
strcpy(sreq.m_url, cleanUrl );
// parentdocid of 0
// . the first ip is faked from a hash of the cleaned url; 0 and -1 are
//   replaced with 1
long firstIp = hash32n(cleanUrl);
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
sreq.setKey( firstIp,0LL, false );
sreq.m_isInjecting = 1;
sreq.m_isPageInject = 1;
sreq.m_hopCount = hopCount;
sreq.m_hopCountValid = 1;
sreq.m_fakeFirstIp = 1;
sreq.m_firstIp = firstIp;
// shortcut
XmlDoc *xd = &m_xd;
// log it now
//log("inject: injecting doc %s",cleanUrl);
static char s_dummy[3];
// sometimes the content is indeed NULL...
if ( newOnly && ! content ) {
//.........这里部分代码省略.........
示例7: Scraper::gotPhrase
// . called once g_wiki.m_randPhrase holds a random phrase to search for
// . picks one of the enabled query types for the collection (0 = web,
//   1 = news, 2 = blogs), builds the matching google search url for the
//   phrase and points m_xd at a fake SpiderRequest for that url
// . returns without scraping if the collection rec is missing, if every
//   scraping type is disabled, or while g_repairMode is set
// . the rest of the function is cut off below
void Scraper::gotPhrase ( ) {
	// error getting random phrase? bail!
	// NOTE(review): despite the comment this only logs and carries on;
	// left as it was since the intended recovery is not visible here
	if ( g_errno ) log("scraper: got error getting random phrase: %s",
			   mstrerror(g_errno));
	CollectionRec *cr = g_collectiondb.getRec ( m_coll );
	// the rec is dereferenced right below, so do not go on without one
	if ( ! cr ) {
		log("scraper: no collection rec, not scraping");
		return;
	}
	// . nothing to pick from if every type is turned off
	// . the retry loop below would spin forever without this check
	if ( ! cr->m_scrapingEnabledWeb  &&
	     ! cr->m_scrapingEnabledNews &&
	     ! cr->m_scrapingEnabledBlogs )
		return;
	// . what type of query should we do?
	// . keep drawing until we hit a type that is enabled
	for ( ; ; ) {
		m_qtype = rand() % 3;
		// make sure web, news, blog is enabled
		if ( m_qtype == 0 && cr->m_scrapingEnabledWeb   ) break;
		if ( m_qtype == 1 && cr->m_scrapingEnabledNews  ) break;
		if ( m_qtype == 2 && cr->m_scrapingEnabledBlogs ) break;
	}
	// scraping is off when repairing obviously
	if ( g_repairMode ) return;
	// get it
	char *s = g_wiki.m_randPhrase;
	// convert _'s to spaces (this edits the phrase in place)
	for ( char *p = s ; *p ; p++ )
		if ( *p == '_' ) *p = ' ';
	// . url encode the random phrase
	// . truncate it to 200 bytes to keep things sane
	// . Wiki::doneReadingWiki() keeps it below 128 i think anyway
	char qe[400];
	urlEncode(qe, 200, s , gbstrlen(s) );
	char *end = qe + 390;
	// half the time append a random word from dictionary so that we
	// discover those tail-end sites better
	if ( m_qtype == 0 && (rand() % 2) ) {
		// point into it for appending
		char *p = qe + gbstrlen(qe);
		// add a space, url encoded
		*p++ = '+';
		// append a random word to it from dictionary
		char *rw = g_speller.getRandomWord();
		// append that in
		urlEncode( p , end - p - 1 , rw , gbstrlen(rw) );
	}
	// make a query to scrape
	char buf[2048];
	const char *uf = NULL;
	if ( m_qtype == 0 )
		uf="http://www.google.com/search?num=50&q=%s&scoring=d"
			"&filter=0";
	// google news query? sort by date.
	else if ( m_qtype == 1 )
		uf="http://news.google.com/news?num=50&q=%s&sort=n"
			"&filter=0";
	// google blog query?
	else if ( m_qtype == 2 )
		uf="http://www.google.com/blogsearch?num=50&q=%s&scoring=d"
			"&filter=0";
	// sanity check
	else { char *xx=NULL;*xx=0; }
	// make the url we will download (bounded by the size of buf)
	snprintf ( buf , sizeof(buf) , uf , qe );
	SpiderRequest sreq;
	// start from a clean request like every other place that builds one
	sreq.reset();
	// . set the SpiderRequest
	// . use the finished url in "buf". this used to copy and hash "uf",
	//   the printf format that still has the raw %s in it, so the query
	//   never made it into the request.
	strcpy(sreq.m_url, buf);
	// . tell it to only add the hosts of each outlink for now!
	// . that will be passed on to when XmlDoc calls Links::set() i guess
	// . xd will not reschedule the scraped url into spiderdb either
	sreq.m_isScraping  = 1;
	sreq.m_fakeFirstIp = 1;
	// fake the first ip from a hash of the url; 0 and -1 become 1
	long firstIp = hash32n(buf);
	if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
	sreq.m_firstIp = firstIp;
	// parent docid is 0
	sreq.setKey(firstIp,0LL,false);
	// forceDEl = false, niceness = 0
	m_xd.set4 ( &sreq , NULL , m_coll , NULL , 0 );
	//m_xd.m_isScraping = true;
	// download without throttling
	//m_xd.m_throttleDownload = false;
	// disregard this
	m_xd.m_useRobotsTxt = false;
	// call this when index completes
	m_xd.setCallback ( NULL , indexedDocWrapper );
	// assume it blocked
	m_numSent++;
	// scraper is special
	m_xd.m_usePosdb     = false;
	m_xd.m_useDatedb    = false;
	m_xd.m_useClusterdb = false;
//.........这里部分代码省略.........
示例8: sendPageAddUrl
//.........这里部分代码省略.........
// Tail of sendPageAddUrl; the start of the function is not shown, so "r",
// "st1", "cr", "h", "url" and "coll" are set by code outside this span.
// . turns away non-admin submitters that fail the rate limit or the turing
//   test
// . otherwise builds the SpiderRequest held in st1->m_sreq for st1->m_url
//   and adds it to spiderdb through st1->m_msg4
// . returns false if the add blocked (addedStuff is passed as the callback),
//   otherwise whatever sendReply() returns
// . should we force it into spiderdb even if already in there
// . use to manually update spider times for a url
// . however, will not remove old scheduled spider times
// . mdw: made force on the default
st1->m_forceRespider = r->getLong("force",1); // 0);
long now = getTimeGlobal();
// . allow 1 submit every 1 hour
// . restrict by submitter domain ip
if ( ! st1->m_isAdmin &&
     ! canSubmit ( h , now , cr->m_maxAddUrlsPerIpDomPerDay ) ) {
	// return error page
	g_errno = ETOOEARLY;
	return sendReply ( st1 , true );
}
// check it, if turing test is enabled for this collection
if ( ! st1->m_isAdmin && cr->m_doTuringTest &&
     ! g_turingTest.isHuman(r) ) {
	// log note so we know it didn't make it
	g_msg = " (error: bad answer)";
	st1->m_goodAnswer = false;
	return sendReply ( st1 , true /*addUrl enabled?*/ );
}
// if no url given, just print a blank page
if ( ! url ) return sendReply ( st1 , true );
// . sanity check the url before building anything from it
// . this check used to run only after the request was filled in; dlen is
//   required by getDomFast() but not otherwise used
long dlen;
char *dom = getDomFast ( st1->m_url , &dlen );
if ( ! dom ) {
	g_errno = EBADURL;
	return sendReply ( st1 , true );
}
//
// make a SpiderRequest
//
SpiderRequest *sreq = &st1->m_sreq;
// reset it
sreq->reset();
// make the probable docid
long long probDocId = g_titledb.getProbableDocId ( st1->m_url );
// . make one up, like we do in PageReindex.cpp
// . need a fake first ip lest we core!
long firstIp = (probDocId & 0xffffffff);
// . 0 and -1 are not usable first ips; every other place that fakes one
//   maps them to 1, so do the same here
// . the mask makes the -1 test work whether "long" is 32 or 64 bits wide
if ( firstIp == 0 || (firstIp & 0xffffffff) == 0xffffffff ) firstIp = 1;
// . now fill it up
// . TODO: calculate the other values... lazy!!! (m_isRSSExt,
//   m_siteNumInlinks,...)
sreq->m_isNewOutlink = 1;
sreq->m_isAddUrl     = 1;
sreq->m_addedTime    = now;
sreq->m_fakeFirstIp  = 1;
sreq->m_probDocId    = probDocId;
sreq->m_firstIp      = firstIp;
sreq->m_hopCount     = 0;
// its valid if root
Url uu; uu.set ( st1->m_url );
if ( uu.isRoot() ) sreq->m_hopCountValid = true;
// the url! includes \0
strcpy ( sreq->m_url , st1->m_url );
// call this to set sreq->m_dataSize now
sreq->setDataSize();
// make the key dude -- after setting url
sreq->setKey ( firstIp , 0LL, false );
// shortcut
Msg4 *m = &st1->m_msg4;
// now add that to spiderdb using msg4
if ( ! m->addMetaList ( (char *)sreq        ,
			sreq->getRecSize()  ,
			coll                ,
			st1                 , // state
			addedStuff          ,
			MAX_NICENESS        ,
			RDB_SPIDERDB        ) )
	// we blocked
	return false;
// send back the reply
return sendReply ( st1 , true );
}