

C++ SpiderRequest::setKey Method Code Examples

This article collects typical usage examples of the C++ SpiderRequest::setKey method. If you are wondering what SpiderRequest::setKey does, how to call it, or how it is used in practice, the hand-picked examples below may help. You can also explore further usage examples of the SpiderRequest class that the method belongs to.


Six code examples of SpiderRequest::setKey are presented below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better C++ code samples.
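
Before diving in, note the pattern all six examples share: build a SpiderRequest, derive a pseudo "first IP" by hashing the URL with hash32n, clamp the reserved values 0 and -1 to 1, then call setKey with that IP, a parent docid of 0, and false. The sketch below distills this pattern; the #include names and the wrapper function are assumptions added here for illustration, while the SpiderRequest fields and calls mirror the examples themselves.

// A minimal sketch of the setKey pattern shared by the examples below.
// The #include names and the wrapper function are assumptions for
// illustration; the SpiderRequest fields and calls mirror the examples.
#include "Spider.h"   // SpiderRequest (assumed header name)
#include "hash.h"     // hash32n (assumed header name)
#include <cstring>

void setFakeKey ( SpiderRequest &sreq , const char *url ) {
	sreq.reset();
	strcpy ( sreq.m_url , url );
	// spiderdb keys need a first ip; fake one by hashing the url
	long firstIp = hash32n ( url );
	// 0 and -1 are treated as invalid hashes, so clamp them to 1
	if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
	sreq.m_fakeFirstIp = 1;
	sreq.m_firstIp     = firstIp;
	// parent docid of 0; the trailing false is presumably a delete
	// flag, since every example here passes false
	sreq.setKey ( firstIp , 0LL , false );
}

Each example then sets additional flags for its particular entry point (m_isPageParser, m_isScraping, m_isInjecting, and so on) before handing the request to XmlDoc or spiderdb.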

Example 1: sendPageAnalyze


//......... some code omitted here .........
	long isXml = r->getLong("xml",0);

	// if got docid, use that
	if ( st->m_docId != -1 ) {
		if ( ! xd->set3 ( st->m_docId,
				  st->m_coll,
				  0 ) ) // niceness
			// return error reply if g_errno is set
			return sendErrorReply ( st , g_errno );
		// make this our callback in case something blocks
		xd->setCallback ( st , gotXmlDoc );
		xd->m_pbuf = &st->m_wbuf;
		// reset this flag
		st->m_donePrinting = false;
		// . set xd from the old title rec if recycle is true
		// . can also use XmlDoc::m_loadFromOldTitleRec flag
		//if ( st->m_recycle ) xd->m_recycleContent = true;
		xd->m_recycleContent = true;
		// force this on
		//xd->m_useSiteLinkBuf = true;
		//xd->m_usePageLinkBuf = true;
		if ( isXml ) xd->m_printInXml = true;
		// now tell it to fetch the old title rec
		if ( ! xd->loadFromOldTitleRec () )
			// return false if this blocks
			return false;
		return gotXmlDoc ( st );
	}

	// set this up
	SpiderRequest sreq;
	sreq.reset();
	if ( st->m_u ) strcpy(sreq.m_url,st->m_u);
	long firstIp = hash32n(st->m_u);
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	// parentdocid of 0
	sreq.setKey( firstIp, 0LL, false );
	sreq.m_isPageParser = 1;
	sreq.m_hopCount = st->m_hopCount;
	sreq.m_hopCountValid = 1;
	sreq.m_fakeFirstIp   = 1;
	sreq.m_firstIp = firstIp;
	Url nu;
	nu.set(sreq.m_url);
	sreq.m_domHash32 = nu.getDomainHash32();
	sreq.m_siteHash32 = nu.getHostHash32();

	// . get provided content if any
	// . will be NULL if none provided
	// . "content" may contain a MIME
	long  contentLen = 0;
	char *content = r->getString ( "content" , &contentLen , NULL );
	// is the "content" url-encoded? default is true.
	bool contentIsEncoded = true;
	// mark doesn't like to url-encode his content
	if ( ! content ) {
		content    = r->getUnencodedContent    ();
		contentLen = r->getUnencodedContentLen ();
		contentIsEncoded = false;
	}
	// ensure null
	if ( contentLen == 0 ) content = NULL;

	//uint8_t contentType = CT_HTML;
	//if ( isXml ) contentType = CT_XML;
	long ctype = r->getLong("ctype",CT_HTML);

	// . use the enormous power of our new XmlDoc class
	// . this returns false if blocked
	if ( ! xd->set4 ( &sreq       ,
			  NULL        ,
			  st->m_coll  ,
			  // we need this so the term table is set!
			  &st->m_wbuf        , // XmlDoc::m_pbuf
			  0, // try 0 now! 1 ,//PP_NICENESS ))
			  content ,
			  false, // deletefromindex
			  0, // forced ip
			  ctype ))
		// return error reply if g_errno is set
		return sendErrorReply ( st , g_errno );
	// make this our callback in case something blocks
	xd->setCallback ( st , gotXmlDoc );
	// reset this flag
	st->m_donePrinting = false;
	// prevent a core here in the event we download the page content
	xd->m_crawlDelayValid = true;
	xd->m_crawlDelay = 0;
	// . set xd from the old title rec if recycle is true
	// . can also use XmlDoc::m_loadFromOldTitleRec flag
	//if ( st->m_recycle ) xd->m_recycleContent = true;
	// only recycle if docid is given!!
	if ( st->m_recycle ) xd->m_recycleContent = true;
	// force this on
	//xd->m_useSiteLinkBuf = true;
	//xd->m_usePageLinkBuf = true;
	if ( isXml ) xd->m_printInXml = true;

	return gotXmlDoc ( st );
}
Developer: acchou, Project: open-source-search-engine, Lines of code: 101, Source file: PageParser.cpp

Example 2: sendPageParser2


//......... some code omitted here .........
			  "</textarea>"
			  "</td>"
			  "</tr>"

		  "</table>"
		  "</center>"
		  "</form>"
		  "<br>",

			  //oips ,
			  contentParm );



	xbuf->safePrintf(
			 "<center>"
			 "<input type=submit value=Submit>"
			 "</center>"
			 );


	// just print the page if no url given
	if ( ! st->m_u || ! st->m_u[0] ) return processLoop ( st );


	XmlDoc *xd = &st->m_xd;
	// set this up
	SpiderRequest sreq;
	sreq.reset();
	strcpy(sreq.m_url,st->m_u);
	long firstIp = hash32n(st->m_u);
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	// parentdocid of 0
	sreq.setKey( firstIp, 0LL, false );
	sreq.m_isPageParser = 1;
	sreq.m_hopCount = st->m_hopCount;
	sreq.m_hopCountValid = 1;
	sreq.m_fakeFirstIp   = 1;
	sreq.m_firstIp = firstIp;
	Url nu;
	nu.set(sreq.m_url);
	sreq.m_domHash32 = nu.getDomainHash32();
	sreq.m_siteHash32 = nu.getHostHash32();

	// . get provided content if any
	// . will be NULL if none provided
	// . "content" may contain a MIME
	long  contentLen = 0;
	char *content = r->getString ( "content" , &contentLen , NULL );
	// is the "content" url-encoded? default is true.
	bool contentIsEncoded = true;
	// mark doesn't like to url-encode his content
	if ( ! content ) {
		content    = r->getUnencodedContent    ();
		contentLen = r->getUnencodedContentLen ();
		contentIsEncoded = false;
	}
	// ensure null
	if ( contentLen == 0 ) content = NULL;

	uint8_t contentType = CT_HTML;
	if ( r->getBool("xml",0) ) contentType = CT_XML;

	contentType = r->getLong("ctype",contentType);//CT_HTML);

Developer: acchou, Project: open-source-search-engine, Lines of code: 65, Source file: PageParser.cpp

Example 3:

// . "uf" is printf url format to scrape with a %s for the query
// . example: uf="http://www.google.com/search?num=50&q=%s&scoring=d&filter=0";
bool Msg7::scrapeQuery ( ) {

	// advance round now in case we return early
	m_round++;

	// error?
	if ( m_qbuf.length() > 500 ) {
		g_errno = EQUERYTOOBIG;
		return true;
	}

	// first encode the query
	SafeBuf ebuf;
	ebuf.urlEncode ( m_qbuf.getBufStart() ); // queryUNEncoded );

	char *uf;
	if ( m_round == 1 )
		// set to 1 for debugging
		uf="http://www.google.com/search?num=20&"
			"q=%s&scoring=d&filter=0";
		//uf = "https://startpage.com/do/search?q=%s";
		//uf = "http://www.google.com/"
		//	"/cse?cx=013269018370076798483%3A8eec3papwpi&"
		//	"ie=UTF-8&q=%s&"
		//	"num=20";
	else
		uf="http://www.bing.com/search?q=%s";

	// skip bing for now
	//if ( m_round == 2 )
	//	return true;
	//if ( m_round == 1 )
	//	return true;
		
	// make the url we will download
	char ubuf[2048];
	sprintf ( ubuf , uf , ebuf.getBufStart() );

	// log it
	log("inject: SCRAPING %s",ubuf);

	SpiderRequest sreq;
	sreq.reset();
	// set the SpiderRequest
	strcpy(sreq.m_url, ubuf);
	// . tell it to only add the hosts of each outlink for now!
	// . that will be passed on to when XmlDoc calls Links::set() i guess
	// . xd will not reschedule the scraped url into spiderdb either
	sreq.m_isScraping = 1;
	sreq.m_fakeFirstIp = 1;
	long firstIp = hash32n(ubuf);
	if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
	sreq.m_firstIp = firstIp;
	// parent docid is 0
	sreq.setKey(firstIp,0LL,false);

	// forceDEl = false, niceness = 0
	m_xd.set4 ( &sreq , NULL , m_coll , NULL , 0 ); 

	//m_xd.m_isScraping = true;

	// download without throttling
	//m_xd.m_throttleDownload = false;

	// disregard this
	m_xd.m_useRobotsTxt = false;

	// this will tell it to index ahrefs first before indexing
	// the doc. but do NOT do this if we are from ahrefs.com
	// ourselves to avoid recursive explosion!!
	if ( m_useAhrefs )
		m_xd.m_useAhrefs = true;

	m_xd.m_reallyInjectLinks = m_injectLinks;

	//
	// rather than just add the links of the page to spiderdb,
	// let's inject them!
	//
	m_xd.setCallback ( this , doneInjectingLinksWrapper );

	// niceness is 0
	m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab2");

	// do we actually inject the links, or just scrape?
	if ( ! m_xd.injectLinks ( &m_linkDedupTable ,
				  NULL,
				  this , 
				  doneInjectingLinksWrapper ) ) 
		return false;
	// otherwise, just download the google/bing search results so we
	// can display them in xml
	//else if ( m_xd.getUtf8Content() == (char **)-1 )
	//	return false;
		
	// print reply..
	//printReply();
	return true;
//......... some code omitted here .........
Developer: BKJackson, Project: open-source-search-engine, Lines of code: 101, Source file: PageInject.cpp

Example 4: void

bool Msg7::inject ( char *url ,
		    long  forcedIp ,
		    char *content ,
		    long  contentLen ,
		    bool  recycleContent,
		    uint8_t contentType,
		    char *coll ,
		    bool  quickReply ,
		    char *username ,
		    char *pwd ,
		    long  niceness,
		    void *state ,
		    void (*callback)(void *state),
		    long firstIndexed,
		    long lastSpidered,
		    long hopCount,
		    char newOnly,
		    short charset,
		    char spiderLinks,
		    char deleteIt,
		    char hasMime,
		    bool doConsistencyTesting
		    ) {

	m_quickReply = quickReply;

	// store coll
	if ( ! coll ) { g_errno = ENOCOLLREC; return true; }
	long collLen = gbstrlen ( coll );
	if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
	strncpy ( m_coll , coll , collLen );
	m_coll [ collLen ] = '\0';

	// store user
	//long ulen = 0;
	//if ( username ) ulen = gbstrlen(username);
	//if ( ulen >= MAX_USER_SIZE-1 ) {g_errno = EBUFOVERFLOW; return true;}
	//if ( username ) strcpy( m_username, username );

	// store password
	//long pwdLen = 0;
	//if ( pwd ) pwdLen = gbstrlen(pwd);
	//m_pwd [ 0 ] ='\0';
	//if ( pwdLen > 31 ) pwdLen = 31;
	//if ( pwdLen > 0 ) strncpy ( m_pwd , pwd , pwdLen );
	//m_pwd [ pwdLen ] = '\0';

	// store url
	if ( ! url ) { g_errno = 0; return true; }
	long urlLen = gbstrlen(url);
	if ( urlLen > MAX_URL_LEN ) {g_errno = EBADENGINEER; return true; }
	// skip injecting if no url given! just print the admin page.
	if ( urlLen <= 0 ) return true;
	//strcpy ( m_url , url );

	if ( g_repairMode ) { g_errno = EREPAIRING; return true; }

	// send template reply if no content supplied
	if ( ! content && ! recycleContent ) {
		log("inject: no content supplied to inject command and "
		    "recycleContent is false.");
		//return true;
	}

	// clean url?
	// normalize and add www. if it needs it
	Url uu;
	uu.set ( url , gbstrlen(url) , true );
	// remove >'s i guess and store in st1->m_url[] buffer
	char cleanUrl[MAX_URL_LEN+1];
	urlLen = cleanInput ( cleanUrl,
			      MAX_URL_LEN, 
			      uu.getUrl(),
			      uu.getUrlLen() );


	// this can go on the stack since set4() copies it
	SpiderRequest sreq;
	sreq.reset();
	strcpy(sreq.m_url, cleanUrl );
	// parentdocid of 0
	long firstIp = hash32n(cleanUrl);
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	sreq.setKey( firstIp,0LL, false );
	sreq.m_isInjecting   = 1; 
	sreq.m_isPageInject  = 1;
	sreq.m_hopCount      = hopCount;
	sreq.m_hopCountValid = 1;
	sreq.m_fakeFirstIp   = 1;
	sreq.m_firstIp       = firstIp;

	// shortcut
	XmlDoc *xd = &m_xd;

	// log it now
	//log("inject: injecting doc %s",cleanUrl);

	static char s_dummy[3];
	// sometims the content is indeed NULL...
	if ( newOnly && ! content ) { 
//......... some code omitted here .........
Developer: BKJackson, Project: open-source-search-engine, Lines of code: 101, Source file: PageInject.cpp

Example 5: gotPhrase

void Scraper::gotPhrase ( ) {
	// error getting random phrase? bail!
	if ( g_errno ) log("scraper: got error getting random phrase: %s",
			   mstrerror(g_errno));

	CollectionRec *cr = g_collectiondb.getRec ( m_coll );

 loop:
	// what type of query should we do?
	m_qtype = rand() % 3;

	// make sure web, news, blog is enabled
	if ( m_qtype == 0 && ! cr->m_scrapingEnabledWeb   ) goto loop;
	if ( m_qtype == 1 && ! cr->m_scrapingEnabledNews  ) goto loop;
	if ( m_qtype == 2 && ! cr->m_scrapingEnabledBlogs ) goto loop;

	// scraping is off when repairing obviously
	if ( g_repairMode ) return;

	// get it
	char *s = g_wiki.m_randPhrase;
	// convert _'s to spaces
	for ( char *p = s ; *p ; p++ )
		if ( *p == '_' ) *p = ' ';
	// . url encode the random phrase
	// . truncate it to 200 bytes to keep things sane
	// . Wiki::doneReadingWiki() keeps it below 128 i think anyway
	char qe[400];
	urlEncode(qe, 200, s , gbstrlen(s) );
	char *end = qe + 390;

	// half the time append a random word from dictionary so that we 
	// discovery those tail-end sites better
	if ( m_qtype == 0 && (rand() % 2) ) { 
		// point into it for appending
		char *p = qe + gbstrlen(qe);
		// add a space, url encoded
		*p++ = '+';
		// append a random word to it from dictionary
		char *rw = g_speller.getRandomWord();
		// append that in
		urlEncode( p , end - p - 1 , rw , gbstrlen(rw) );
	}

	// make a query to scrape
	char buf[2048];

	char *uf ;
	if      ( m_qtype == 0 )
		uf="http://www.google.com/search?num=50&q=%s&scoring=d"
			"&filter=0";
	// google news query? sort by date.
	else if ( m_qtype == 1 )
		uf="http://news.google.com/news?num=50&q=%s&sort=n"
			"&filter=0";
	// google blog query?
	else if ( m_qtype == 2 ) 
		uf="http://www.google.com/blogsearch?num=50&q=%s&scoring=d"
			"&filter=0";
	// sanity check
	else { char *xx=NULL;*xx=0; }

	// make the url we will download
	sprintf ( buf , uf , qe );

	SpiderRequest sreq;
	sreq.reset();
	// set the SpiderRequest to the formatted url in buf, not the raw
	// printf format string in uf
	strcpy(sreq.m_url, buf);
	// . tell it to only add the hosts of each outlink for now!
	// . that will be passed on to when XmlDoc calls Links::set() i guess
	// . xd will not reschedule the scraped url into spiderdb either
	sreq.m_isScraping = 1;
	sreq.m_fakeFirstIp = 1;
	long firstIp = hash32n(buf);
	if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
	sreq.m_firstIp = firstIp;
	// parent docid is 0
	sreq.setKey(firstIp,0LL,false);

	// forceDEl = false, niceness = 0
	m_xd.set4 ( &sreq , NULL , m_coll , NULL , 0 ); 

	//m_xd.m_isScraping = true;

	// download without throttling
	//m_xd.m_throttleDownload = false;

	// disregard this
	m_xd.m_useRobotsTxt = false;

	// call this when index completes
	m_xd.setCallback ( NULL , indexedDocWrapper );

	// assume it blocked
	m_numSent++;

	// scraper is special
	m_xd.m_usePosdb     = false;
	m_xd.m_useDatedb    = false;
	m_xd.m_useClusterdb = false;
//......... some code omitted here .........
Developer: BKJackson, Project: open-source-search-engine, Lines of code: 101, Source file: Scraper.cpp

Example 6: sendPageAddUrl


//.........这里部分代码省略.........
	// . should we force it into spiderdb even if already in there
	// . use to manually update spider times for a url
	// . however, will not remove old scheduled spider times
	// . mdw: made force on the default
	st1->m_forceRespider = r->getLong("force",1); // 0);

	long now = getTimeGlobal();

	// . allow 1 submit every 1 hour
	// . restrict by submitter domain ip
	if ( ! st1->m_isAdmin &&
	     ! canSubmit ( h , now , cr->m_maxAddUrlsPerIpDomPerDay ) ) {
		// return error page
		g_errno = ETOOEARLY;
		return sendReply ( st1 , true );
	}


	//st1->m_query = r->getString( "qts", &st1->m_queryLen );


	// check it, if turing test is enabled for this collection
	if ( ! st1->m_isAdmin && cr->m_doTuringTest && 
	     ! g_turingTest.isHuman(r) )  {
		// log note so we know it didn't make it
		g_msg = " (error: bad answer)";
		//log("PageAddUrl:: addurl failed for %s : bad answer",
		//    iptoa(s->m_ip));
		st1->m_goodAnswer = false;
		return sendReply ( st1 , true /*addUrl enabled?*/ );
	}

	//if ( st1->m_queryLen > 0 )
	//	return getPages( st1 );

	// if no url given, just print a blank page
	if ( ! url ) return sendReply (  st1 , true );


	//
	// make a SpiderRequest
	//

	SpiderRequest *sreq = &st1->m_sreq;
	// reset it
	sreq->reset();
	// make the probable docid
	long long probDocId = g_titledb.getProbableDocId ( st1->m_url );
	// make one up, like we do in PageReindex.cpp
	long firstIp = (probDocId & 0xffffffff);
	// . now fill it up
	// . TODO: calculate the other values... lazy!!! (m_isRSSExt, 
	//         m_siteNumInlinks,...)
	sreq->m_isNewOutlink = 1;
	sreq->m_isAddUrl     = 1;
	sreq->m_addedTime    = now;
	sreq->m_fakeFirstIp   = 1;
	sreq->m_probDocId     = probDocId;
	sreq->m_firstIp       = firstIp;
	sreq->m_hopCount      = 0;
	// its valid if root
	Url uu; uu.set ( st1->m_url );
	if ( uu.isRoot() ) sreq->m_hopCountValid = true;
	// too big?
	//long len = st1->m_urlLen;
	// the url! includes \0
	strcpy ( sreq->m_url , st1->m_url );
	// call this to set sreq->m_dataSize now
	sreq->setDataSize();
	// make the key dude -- after setting url
	sreq->setKey ( firstIp , 0LL, false );
	// need a fake first ip lest we core!
	//sreq->m_firstIp = (pdocId & 0xffffffff);
	// how to set m_firstIp? i guess addurl can be throttled independently
	// of the other urls???  use the hash of the domain for it!
	long  dlen;
	char *dom = getDomFast ( st1->m_url , &dlen );
	// fake it for this...
	//sreq->m_firstIp = hash32 ( dom , dlen );
	// sanity
	if ( ! dom ) {
		g_errno = EBADURL;
		return sendReply ( st1 , true );
	}
	// shortcut
	Msg4 *m = &st1->m_msg4;
	// now add that to spiderdb using msg4
	if ( ! m->addMetaList ( (char *)sreq    ,
				sreq->getRecSize() ,
				coll            ,
				st1             , // state
				addedStuff      ,
				MAX_NICENESS    ,
				RDB_SPIDERDB    ) )
		// we blocked
		return false;

	// send back the reply
	return sendReply ( st1 , true );
}
Developer: RevBooyah, Project: open-source-search-engine, Lines of code: 101, Source file: PageAddUrl.cpp


Note: The SpiderRequest::setKey examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective developers, and copyright remains with the original authors; consult each project's License before distributing or using the code, and do not reproduce this article without permission.