This article collects and summarizes typical usage examples of the C++ method Url::getDomainLen. If you are wondering what Url::getDomainLen does, how to use it, or want concrete examples of it in action, the curated code samples here may help. You can also explore further usage examples of its containing class, Url.
The following shows 5 code examples of Url::getDomainLen, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better C++ code examples.
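Before the examples, the core contract in one minimal sketch: getDomain() returns a pointer into the parsed URL's internal buffer, and getDomainLen() gives the length of that domain slice, which is not NUL-terminated (Example 5 below terminates a copy by hand before printing). This is a sketch only, assuming Gigablast's Url.h is in scope; the sample URL is hypothetical.

Url u;
u.set("http://www.example.com/index.html"); // parse a URL
const char *dom  = u.getDomain();    // points into u's buffer, not NUL-terminated
int32_t     dlen = u.getDomainLen(); // length of the domain slice, here 11
if ( dom && dlen > 0 ) {
	// always pair the pointer with the length when reading the slice
	printf("domain: %.*s\n", dlen, dom); // prints "domain: example.com"
}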
Example 1: init
// init our rdb
bool Titledb::init ( ) {
// key sanity tests
int64_t uh48 = 0x1234567887654321LL & 0x0000ffffffffffffLL;
int64_t docId = 123456789;
key96_t k = makeKey(docId,uh48,false);
if ( getDocId(&k) != docId ) { g_process.shutdownAbort(true);}
if ( getUrlHash48(&k) != uh48 ) { g_process.shutdownAbort(true);}
const char *url = "http://.ezinemark.com/int32_t-island-child-custody-attorneys-new-york-visitation-lawyers-melville-legal-custody-law-firm-45f00bbed18.html";
Url uu;
uu.set(url);
const char *d1 = uu.getDomain();
int32_t dlen1 = uu.getDomainLen();
int32_t dlen2 = 0;
const char *d2 = getDomFast ( url , &dlen2 );
if ( !d1 || !d2 ) { g_process.shutdownAbort(true); }
if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }
// another one
url = "http://ok/";
uu.set(url);
const char *d1a = uu.getDomain();
dlen1 = uu.getDomainLen();
dlen2 = 0;
const char *d2a = getDomFast ( url , &dlen2 );
if ( d1a || d2a ) { g_process.shutdownAbort(true); }
if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }
// . what's max # of tree nodes?
// . assume avg TitleRec size (compressed html doc) is about 1k, so we get:
// . NOTE: overhead is about 32 bytes per node
int32_t maxTreeNodes = g_conf.m_titledbMaxTreeMem / (1*1024);
// initialize our own internal rdb
return m_rdb.init ( "titledb" ,
-1 , // fixed record size
//g_conf.m_titledbMinFilesToMerge ,
// this should not really be changed...
-1,
g_conf.m_titledbMaxTreeMem ,
maxTreeNodes ,
false, // half keys?
12, // key size
false, //isCollectionLess
false); //useIndexFile
// validate
//return verify ( );
}
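As a quick worked example of the node-cap arithmetic in the comments inside init() above (the ~1KB average compressed TitleRec is an assumption stated there, not a guarantee): dividing the tree-memory budget by 1KB, the 200MB budget hard-coded in Example 4 below comes out to roughly 195,000 tree nodes.

int64_t maxMem       = 200000000;         // 200MB tree-memory budget (Example 4)
int32_t maxTreeNodes = maxMem / (1*1024); // = 195312 nodes at ~1KB per record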
Example 2: setCandidates
void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
Sections *sections , XmlDoc *xd ) {
// not valid for now
m_thumbnailValid = false;
// reset our array of image node candidates
m_numImages = 0;
// flag it
m_setCalled = true;
// strange...
if ( m_imgReply ) { char *xx=NULL;*xx=0; } // deliberate NULL write = crash-style assert
// save this
m_xml = xml;
m_pageUrl = pageUrl;
// if we are a diffbot json reply, trust that diffbot got the
// best candidate, and just use that
if ( xd->m_isDiffbotJSONObject ) return;
//m_pageSite = pageSite;
// scan the words
long nw = words->getNumWords();
nodeid_t *tids = words->getTagIds();
long long *wids = words->getWordIds();
//long *scores = scoresArg->m_scores;
Section **sp = NULL;
if ( sections ) sp = sections->m_sectionPtrs;
// not if we don't have any identified sections
if ( sections && sections->m_numSections <= 0 ) sp = NULL;
// the positive scored window
long firstPosScore = -1;
long lastPosScore = -1;
long badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE;
// find positive scoring window
for ( long i = 0 ; i < nw ; i++ ) {
// skip if in bad section
if ( sp && (sp[i]->m_flags & badFlags) ) continue;
if ( wids[i] != 0 ) continue;
// set first positive scoring guy
if ( firstPosScore == -1 ) firstPosScore = i;
// keep track of last guy
lastPosScore = i;
}
// sanity check
if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }
// . pedal firstPosScore back until we hit a section boundary
// . i.e. stop once we hit a front/back tag pair, like <div> and </div>
char tc[512];
memset ( tc , 0 , 512 );
long a = firstPosScore;
for ( ; a >= 0 ; a-- ) {
// get the tid
nodeid_t tid = tids[a];
// remove back bit, if any
tid &= BACKBITCOMP;
// skip if not a tag, or a generic xml tag
if ( tid <= 1 ) continue;
// mark it
if ( words->isBackTag(a) ) tc[tid] |= 0x02;
else tc[tid] |= 0x01;
// continue if not a full front/back pair
if ( tc[tid] != 0x03 ) continue;
// continue if not a "section" type tag (see Scores.cpp)
if ( tid != TAG_DIV &&
tid != TAG_TEXTAREA &&
tid != TAG_TR &&
tid != TAG_TD &&
tid != TAG_TABLE )
continue;
// ok we should stop now
break;
}
// min is 0
if ( a < 0 ) a = 0;
// now look for the image urls within this window
for ( long i = a ; i < lastPosScore ; i++ ) {
// skip if not <img> tag
if (tids[i] != TAG_IMG ) continue;
// get the node num into Xml.cpp::m_nodes[] array
long nn = words->m_nodes[i];
// check width to rule out small decorating imgs
long width = xml->getLong(nn,nn+1,"width", -1 );
if ( width != -1 && width < 50 ) continue;
// same with height
long height = xml->getLong(nn,nn+1, "height", -1 );
if ( height != -1 && height < 50 ) continue;
// get the url of the image
long srcLen;
char *src = xml->getString(nn,"src",&srcLen);
// skip if none
if ( srcLen <= 2 ) continue;
// set it to the full url
Url iu;
// use "pageUrl" as the baseUrl
iu.set ( pageUrl , src , srcLen );
// skip if invalid domain or TLD
if ( iu.getDomainLen() <= 0 ) continue;
// skip if not from same domain as page url
//long dlen = pageUrl->getDomainLen();
//if ( iu.getDomainLen() != dlen ) continue;
//......... the remainder of this example is omitted .........
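The decisive filter in this example is the final step shown above: the raw src attribute is resolved against the page URL, and any candidate whose resolved URL has no parseable domain or TLD is discarded. A standalone sketch of just that step, using the Url::set(base, src, len) overload from the listing; both URLs are hypothetical and Gigablast's Url.h is assumed to be included.

#include <cstring> // strlen
Url pageUrl;
pageUrl.set("http://example.com/articles/post.html");
const char *src = "/images/photo.jpg"; // relative src from an <img> tag
Url iu;
iu.set(&pageUrl, src, (int32_t)strlen(src)); // resolves to http://example.com/images/photo.jpg
if ( iu.getDomainLen() <= 0 ) {
	// invalid domain or TLD -- drop this image candidate
}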
Example 3: setCandidates
void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
Sections *sections , XmlDoc *xd ) {
// not valid for now
m_thumbnailValid = false;
// reset our array of image node candidates
m_numImages = 0;
// flag it
m_setCalled = true;
// strange...
if ( m_imgReply ) { char *xx=NULL;*xx=0; }
// save this
m_xml = xml;
m_pageUrl = pageUrl;
//
// first add any open graph candidate.
// basically the page telling us the best image straight up.
//
int32_t node2 = -1;
int32_t startNode = 0;
// . field can be stuff like "summary","description","keywords",...
// . if "convertHtmlEntites" is true we change < to < and > to >
// . <meta property="og:image" content="http://example.com/rock2.jpg"/>
// . <meta property="og:image" content="http://example.com/rock3.jpg"/>
ogimgloop:
char ubuf[2000];
int32_t ulen = xml->getMetaContent( ubuf, 1999, "og:image", 8, "property", startNode, &node2 );
// update this in case goto ogimgloop is called
startNode = node2 + 1;
// see section below for explanation of what we are storing here...
if ( node2 >= 0 ) {
// save it
m_imageNodes[m_numImages] = node2;
Query q;
if ( ulen > MAX_URL_LEN ) goto ogimgloop;
// set it to the full url
Url iu;
// use "pageUrl" as the baseUrl
iu.set( pageUrl, ubuf, ulen );
// skip if invalid domain or TLD
if ( iu.getDomainLen() <= 0 ) goto ogimgloop;
// for looking it up on disk to see if unique or not
char buf[2000];
// if we don't put in quotes it expands '|' into
// the "PiiPe" operator in Query.cpp
snprintf ( buf , 1999, "gbimage:\"%s\"",iu.getUrl());
// TODO: make sure this is a no-split termid storage thingy
// in Msg14.cpp
if ( ! q.set2 ( buf , langUnknown , false ) ) return;
// sanity test
if ( q.getNumTerms() != 1 ) { char *xx=0;*xx=0; }
// store the termid
m_termIds[m_numImages] = q.getTermId(0);
// advance the counter
m_numImages++;
// try to get more graph images if we have some room
if ( m_numImages + 2 < MAX_IMAGES ) goto ogimgloop;
}
//m_pageSite = pageSite;
// scan the words
int32_t nw = words->getNumWords();
nodeid_t *tids = words->getTagIds();
int64_t *wids = words->getWordIds();
//int32_t *scores = scoresArg->m_scores;
Section **sp = NULL;
if ( sections ) sp = sections->m_sectionPtrs;
// not if we don't have any identified sections
if ( sections && sections->m_numSections <= 0 ) sp = NULL;
// the positive scored window
int32_t firstPosScore = -1;
int32_t lastPosScore = -1;
int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
// find positive scoring window
for ( int32_t i = 0 ; i < nw ; i++ ) {
// skip if in bad section
if ( sp && (sp[i]->m_flags & badFlags) ) continue;
if ( wids[i] != 0 ) continue;
// set first positive scoring guy
if ( firstPosScore == -1 ) firstPosScore = i;
// keep track of last guy
lastPosScore = i;
}
// sanity check
if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }
// . pedal firstPosScore back until we hit a section boundary
// . i.e. stop once we hit a front/back tag pair, like <div> and </div>
char tc[512];
memset ( tc , 0 , 512 );
int32_t a = firstPosScore;
for ( ; a >= 0 ; a-- ) {
// get the tid
nodeid_t tid = tids[a];
// remove back bit, if any
tid &= BACKBITCOMP;
//......... the remainder of this example is omitted .........
Example 4: init
// init our rdb
bool Titledb::init ( ) {
// key sanity tests
int64_t uh48 = 0x1234567887654321LL & 0x0000ffffffffffffLL;
int64_t docId = 123456789;
key_t k = makeKey(docId,uh48,false);
if ( getDocId(&k) != docId ) { char *xx=NULL;*xx=0;}
if ( getUrlHash48(&k) != uh48 ) { char *xx=NULL;*xx=0;}
char *url = "http://.ezinemark.com/int32_t-island-child-custody-attorneys-new-york-visitation-lawyers-melville-legal-custody-law-firm-45f00bbed18.html";
Url uu;
uu.set(url);
char *d1 = uu.getDomain();
int32_t dlen1 = uu.getDomainLen();
int32_t dlen2 = 0;
char *d2 = getDomFast ( url , &dlen2 );
if ( dlen1 != dlen2 ) { char *xx=NULL;*xx=0; }
// another one
url = "http://ok/";
uu.set(url);
d1 = uu.getDomain();
dlen1 = uu.getDomainLen();
dlen2 = 0;
d2 = getDomFast ( url , &dlen2 );
if ( dlen1 != dlen2 ) { char *xx=NULL;*xx=0; }
int64_t maxMem = 200000000; // 200MB
// . what's max # of tree nodes?
// . assume avg TitleRec size (compressed html doc) is about 1k, so we get:
// . NOTE: overhead is about 32 bytes per node
int32_t maxTreeNodes = maxMem / (1*1024);
// . we now use a disk page cache for titledb as opposed to the
// old rec cache. i am trying to do away with the Rdb::m_cache rec
// cache in favor of cleverly used disk page caches, because
// the rec caches are not real-time and get stale.
// . just hard-code 30MB for now
int32_t pcmem = 30000000; // = g_conf.m_titledbMaxDiskPageCacheMem;
// fuck that we need all the mem!
//pcmem = 0;
// do not use any page cache if doing tmp cluster in order to
// prevent swapping
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
// init the page cache
// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
if ( ! m_pc.init ( "titledb",
RDB_TITLEDB,
pcmem ,
pageSize ) )
return log("db: Titledb init failed.");
// each entry in the cache is usually just a single record, no lists
//int32_t maxCacheNodes = g_conf.m_titledbMaxCacheMem / (10*1024);
// initialize our own internal rdb
if ( ! m_rdb.init ( g_hostdb.m_dir ,
"titledb" ,
true , // dedup same keys?
-1 , // fixed record size
//g_hostdb.m_groupMask ,
//g_hostdb.m_groupId ,
//g_conf.m_titledbMinFilesToMerge ,
// this should not really be changed...
-1,//3,//230 minfilestomerge mintomerge
maxMem, // g_conf.m_titledbMaxTreeMem ,
maxTreeNodes ,
// now we balance so Sync.cpp can ordered huge list
true , // balance tree?
// turn off cache for now because the page cache
// is just as fast and does not get out of date
// so bad??
//0 ,
0,//g_conf.m_titledbMaxCacheMem ,
0,//maxCacheNodes ,
false ,// half keys?
false ,// g_conf.m_titledbSav
&m_pc , // page cache ptr
true ) )// is titledb?
return false;
return true;
// validate
//return verify ( );
}
Example 5: main
int main ( int argc , char *argv[] ) {
bool addWWW = true;
bool stripSession = true;
// check for arguments
for (int32_t i = 1; i < argc; i++) {
if (strcmp(argv[i], "-w") == 0)
addWWW = false;
else if (strcmp(argv[i], "-s") == 0)
stripSession = false;
}
// initialize
//g_mem.init(100*1024);
hashinit();
//g_conf.m_tfndbExtBits = 23;
loop:
// read a url from stdin
char sbuf[1024];
if ( ! fgets ( sbuf , 1024 , stdin ) ) exit(1);
char *s = sbuf;
char fbuf[1024];
// decode if we should
if ( strncmp(s,"http%3A%2F%2F",13) == 0 ||
strncmp(s,"https%3A%2F%2F",13) == 0 ) {
urlDecode(fbuf,s,gbstrlen(s));
s = fbuf;
}
// old url
printf("###############\n");
printf("old: %s",s);
int32_t slen = gbstrlen(s);
// remove any www. if !addWWW
if (!addWWW) {
if (slen >= 4 &&
strncasecmp(s, "www.", 4) == 0) {
slen -= 4;
memmove(s, &s[4], slen);
}
else {
// get past a ://
int32_t si = 0;
while (si < slen &&
( s[si] != ':' ||
s[si+1] != '/' ||
s[si+2] != '/' ) )
si++;
// remove the www.
if (si + 7 < slen) {
si += 3;
if (strncasecmp(&s[si], "www.", 4) == 0) {
slen -= 4;
memmove(&s[si], &s[si+4], slen-si);
}
}
}
}
// set it
Url u;
u.set ( s , slen ,
addWWW , /*add www?*/
stripSession ); /*strip session ids?*/
// print it
char out[1024*4];
char *p = out;
p += sprintf(p,"tld: ");
gbmemcpy ( p, u.getTLD(),u.getTLDLen());
p += u.getTLDLen();
char c = *p;
*p = '\0';
printf("%s\n",out);
*p = c;
// dom
p = out;
sprintf ( p , "dom: ");
p += gbstrlen ( p );
gbmemcpy ( p , u.getDomain() , u.getDomainLen() );
p += u.getDomainLen();
c = *p;
*p = '\0';
printf("%s\n",out);
*p = c;
// host
p = out;
sprintf ( p , "host: ");
p += gbstrlen ( p );
gbmemcpy ( p , u.getHost() , u.getHostLen() );
p += u.getHostLen();
c = *p;
*p = '\0';
printf("%s\n",out);
*p = c;
// then the whole url
printf("url: %s\n", u.getUrl() );
/*
int32_t siteLen;
char *site = u.getSite ( &siteLen , NULL , false );
if ( site ) {
c = site[siteLen];
//......... the remainder of this example is omitted .........
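A closing note on Example 5: it temporarily NUL-terminates its copy of each field before printing. Since getTLD(), getDomain() and getHost() return unterminated slices paired with length accessors, the same output can be produced with no copying at all by using printf's precision specifier. A sketch under that assumption, with a hypothetical URL:

Url u;
u.set("http://www.example.co.uk/page.html");
// %.*s prints exactly <len> bytes, so the slices need no terminator
printf("tld: %.*s\n",  u.getTLDLen(),    u.getTLD());
printf("dom: %.*s\n",  u.getDomainLen(), u.getDomain());
printf("host: %.*s\n", u.getHostLen(),   u.getHost());
printf("url: %s\n",    u.getUrl());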