本文整理汇总了C#中Abot.Poco.PageToCrawl类的典型用法代码示例。如果您正苦于以下问题:C# PageToCrawl类的具体用法?C# PageToCrawl怎么用?C# PageToCrawl使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
PageToCrawl类属于Abot.Poco命名空间,在下文中一共展示了PageToCrawl类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C#代码示例。
示例1: ShouldCrawlPage
/// <summary>
/// Decides whether <paramref name="pageToCrawl"/> may be crawled, applying the
/// configured depth, scheme, total-page, per-domain and internal/external rules.
/// </summary>
/// <param name="pageToCrawl">Candidate page; a null value is rejected, not thrown.</param>
/// <param name="crawlContext">Crawl-wide state and configuration; null is rejected.</param>
/// <returns>A CrawlDecision whose Reason explains any rejection.</returns>
public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl == null)
        return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    var config = crawlContext.CrawlConfiguration;

    if (pageToCrawl.CrawlDepth > config.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    // Accepts both http and https (and any other scheme starting with "http").
    if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
        return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

    if (crawlContext.CrawledCount + 1 > config.MaxPagesToCrawl)
        return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", config.MaxPagesToCrawl) };

    // Per-domain cap only applies once at least one page of the domain was crawled.
    int pagesCrawledInThisDomain = 0;
    if (config.MaxPagesToCrawlPerDomain > 0 &&
        crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
        pagesCrawledInThisDomain > 0 &&
        pagesCrawledInThisDomain >= config.MaxPagesToCrawlPerDomain)
    {
        return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", config.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
    }

    if (!config.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    return new CrawlDecision { Allow = true };
}
示例2: PageCrawlStartingArgs
/// <summary>
/// Event args raised just before a page crawl begins.
/// </summary>
/// <param name="crawlContext">The overall crawl context; forwarded to the base args.</param>
/// <param name="pageToCrawl">The page about to be crawled; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="pageToCrawl"/> is null.</exception>
public PageCrawlStartingArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl)
: base(crawlContext)
{
if (pageToCrawl == null)
throw new ArgumentNullException("pageToCrawl");
PageToCrawl = pageToCrawl;
}
示例3: PageCrawlDisallowedArgs
/// <summary>
/// Event args raised when a page crawl is disallowed, carrying the reason.
/// </summary>
/// <param name="crawlContext">The overall crawl context; forwarded to the base args.</param>
/// <param name="pageToCrawl">The page that was disallowed; forwarded to the base args.</param>
/// <param name="disallowedReason">Human-readable reason; must be non-null and non-blank.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="disallowedReason"/> is null, empty, or whitespace.
/// NOTE(review): ArgumentException would be more accurate for a non-null blank string,
/// but callers may already catch ArgumentNullException — confirm before changing the type.
/// </exception>
public PageCrawlDisallowedArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl, string disallowedReason)
: base(crawlContext, pageToCrawl)
{
if (string.IsNullOrWhiteSpace(disallowedReason))
throw new ArgumentNullException("disallowedReason");
DisallowedReason = disallowedReason;
}
示例4: SetUp
/// <summary>
/// Per-test fixture setup: a sample page, a two-page list, mocked repositories,
/// and a Scheduler with uri recrawling disabled.
/// </summary>
public void SetUp()
{
    _page = new PageToCrawl { Uri = new Uri("http://a.com/") };

    var pageA = new PageToCrawl { Uri = new Uri("http://a.com/") };
    var pageB = new PageToCrawl { Uri = new Uri("http://b.com/") };
    _pages = new List<PageToCrawl> { pageA, pageB };

    _fakeCrawledUrlRepo = new Mock<ICrawledUrlRepository>();
    _fakePagesToCrawlRepo = new Mock<IPagesToCrawlRepository>();

    _unitUnderTest = new Scheduler(false, _fakeCrawledUrlRepo.Object, _fakePagesToCrawlRepo.Object);
}
示例5: Constructor_ValidUri_CreatesInstance
/// <summary>
/// The Uri constructor should set Uri and leave every other property at its default.
/// </summary>
public void Constructor_ValidUri_CreatesInstance()
{
    var page = new PageToCrawl(new Uri("http://a.com/"));

    Assert.AreEqual(false, page.IsRetry);
    Assert.AreEqual(false, page.IsRoot);
    Assert.AreEqual(false, page.IsInternal);
    Assert.AreEqual(null, page.ParentUri);
    Assert.AreEqual("http://a.com/", page.Uri.AbsoluteUri);
    Assert.AreEqual(0, page.CrawlDepth);
}
示例6: ConvertToPageToCrawl
/// <summary>
/// Maps a stored LinkToCrawl row back into an Abot PageToCrawl, tagging its
/// PageBag with the owning session and crawler ids.
/// </summary>
/// <param name="link">Source row; TargetUrl/SourceUrl must be absolute url strings.</param>
/// <param name="crawlerId">Id of the crawler that will process the page.</param>
/// <returns>A fully-populated PageToCrawl.</returns>
public virtual PageToCrawl ConvertToPageToCrawl(LinkToCrawl link, int crawlerId)
{
    var page = new PageToCrawl(new Uri(link.TargetUrl))
    {
        ParentUri = new Uri(link.SourceUrl),
        CrawlDepth = link.CrawlDepth,
        IsInternal = link.IsInternal,
        IsRoot = link.IsRoot
    };

    // PageBag is dynamic, so these cannot go in the object initializer above.
    page.PageBag.SessionId = link.SessionId;
    page.PageBag.CrawlerId = crawlerId;

    return page;
}
示例7: Constructor_CreatesInstance
/// <summary>
/// The parameterless constructor should leave all properties at their defaults,
/// including a null Uri and a null PageBag.
/// </summary>
public void Constructor_CreatesInstance()
{
    var page = new PageToCrawl();

    Assert.AreEqual(false, page.IsRetry);
    Assert.AreEqual(false, page.IsRoot);
    Assert.AreEqual(false, page.IsInternal);
    Assert.AreEqual(null, page.ParentUri);
    Assert.IsNull(page.Uri);
    Assert.AreEqual(0, page.CrawlDepth);
    Assert.IsNull(page.PageBag);
}
示例8: ConvertToLinkToCrawl
/// <summary>
/// Maps an Abot PageToCrawl into a LinkToCrawl row for persistence.
/// </summary>
/// <param name="page">Page to convert; assumes ParentUri is non-null — TODO confirm callers guarantee this.</param>
/// <param name="sessionId">Id of the crawl session owning the link.</param>
/// <returns>A fully-populated LinkToCrawl.</returns>
public virtual LinkToCrawl ConvertToLinkToCrawl(PageToCrawl page, int sessionId)
{
    return new LinkToCrawl
    {
        SessionId = sessionId,
        SourceUrl = page.ParentUri.AbsoluteUri,
        TargetUrl = page.Uri.AbsoluteUri,
        TargetBaseDomain = page.Uri.GetBaseDomain(),
        CrawlDepth = page.CrawlDepth,
        IsRoot = page.IsRoot,
        IsInternal = page.IsInternal
    };
}
示例9: Add
/// <summary>
/// Atomically records <paramref name="page"/>'s url as seen and pushes the
/// serialized page onto the to-crawl list — but only when the url has not been
/// recorded before (the redis transaction is conditioned on key absence).
/// </summary>
/// <param name="state">Provides the redis connection and the site name used to build keys.</param>
/// <param name="page">Page to schedule; its absolute uri is the dedup key.</param>
public static void Add(SchedulerState state, PageToCrawl page)
{
var json = JsonConvert.SerializeObject(page);
var url = page.Uri.AbsoluteUri;
var trans = CreateTransaction(state);
var crawledPageKey = CrawledPageKey(state.SiteName, url);
var pageToCrawlKey = PageToCrawlKey(state.SiteName);
// If the crawled-page key already exists the whole transaction is a no-op,
// so duplicate urls are never pushed onto the list.
trans.AddCondition(Condition.KeyNotExists(crawledPageKey));
trans.StringSetAsync(crawledPageKey, "");
trans.ListLeftPushAsync(pageToCrawlKey, json);
// NOTE(review): .Wait() blocks the calling thread on async redis I/O —
// confirm callers never run on a synchronization context where this can deadlock.
trans.ExecuteAsync().Wait();
}
示例10: PageBag
/// <summary>
/// The dynamic PageBag should accept arbitrary members and preserve their
/// values, including reference types like a queue.
/// </summary>
public void PageBag()
{
    var page = new PageToCrawl(new Uri("http://a.com/"));

    page.PageBag.SomeVal = "someval";
    page.PageBag.SomeQueue = new Queue<string>();
    page.PageBag.SomeQueue.Enqueue("aaa");
    page.PageBag.SomeQueue.Enqueue("bbb");

    Assert.IsNotNull(page.PageBag);
    Assert.AreEqual("someval", page.PageBag.SomeVal);
    Assert.AreEqual("aaa", page.PageBag.SomeQueue.Dequeue());
    Assert.AreEqual("bbb", page.PageBag.SomeQueue.Dequeue());
}
示例11: Add
/// <summary>
/// Schedules a page for crawling. When uri recrawling is disabled, a page is
/// only queued the first time its uri is seen; retries always bypass the check.
/// </summary>
/// <param name="page">Page to schedule; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="page"/> is null.</exception>
public void Add(PageToCrawl page)
{
    if (page == null)
        throw new ArgumentNullException("page");

    // Recrawl-enabled schedulers and retries skip uri deduplication entirely;
    // otherwise the page is queued only when its uri is new to the crawled repo.
    bool skipDedup = _allowUriRecrawling || page.IsRetry;
    if (skipDedup || _crawledUrlRepo.AddIfNew(page.Uri))
        _pagesToCrawlRepo.Add(page);
}
示例12: ShouldCrawlPage
/// <summary>
/// Robots.txt gate on top of the base decision: blocks pages disallowed by the
/// site's robots.txt unless IsIgnoreRobotsDotTextIfRootDisallowedEnabled lets
/// the root page override (and discard) the robots.txt rules for the site.
/// </summary>
/// <param name="pageToCrawl">The page being considered for crawling.</param>
/// <returns>True when the page may be crawled.</returns>
protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
// No robots.txt loaded means everything is allowed.
bool allowedByRobots = true;
if (_robotsDotText != null)
allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
//https://github.com/sjdirect/abot/issues/96 Handle scenario where the root is allowed but all the paths below are disallowed like "disallow: /*"
var allPathsBelowRootAllowedByRobots = false;
if (_robotsDotText != null && pageToCrawl.IsRoot && allowedByRobots)
{
// Probe an arbitrary child path ("…/aaaaa") to detect a blanket disallow below the root.
var anyPathOffRoot = pageToCrawl.Uri.AbsoluteUri.EndsWith("/") ? pageToCrawl.Uri.AbsoluteUri + "aaaaa": pageToCrawl.Uri.AbsoluteUri + "/aaaaa";
allPathsBelowRootAllowedByRobots = _robotsDotText.IsUrlAllowed(anyPathOffRoot, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
}
if (_crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled && pageToCrawl.IsRoot)
{
if (!allowedByRobots)
{
string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
_logger.DebugFormat(message);
// Discard the robots.txt so subsequent pages of this site are not filtered by it.
allowedByRobots = true;
_robotsDotText = null;
}
else if (!allPathsBelowRootAllowedByRobots)
{
string message = string.Format("All Pages below [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
_logger.DebugFormat(message);
// Same override: drop robots.txt entirely for this site.
allowedByRobots = true;
_robotsDotText = null;
}
}
else if (!allowedByRobots)
{
// Normal path: honor robots.txt, log, and raise the disallowed events.
string message = string.Format("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
_logger.DebugFormat(message);
FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
FirePageCrawlDisallowedEvent(pageToCrawl, message);
return false;
}
// Robots.txt allowed it (or was overridden) — defer to the base decision logic.
return allowedByRobots && base.ShouldCrawlPage(pageToCrawl);
}
示例13: Add
/// <summary>
/// Stores <paramref name="page"/> for future crawling. Assumes any
/// links-to-avoid pre-filtering has already been applied by the caller.
/// </summary>
/// <param name="page">The page to persist; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="page"/> is null.</exception>
public void Add(PageToCrawl page)
{
    if (page == null)
        throw new ArgumentNullException("page");

    // Stamp the page with the ids of the session/crawler that scheduled it.
    page.PageBag.SessionId = SessionId;
    page.PageBag.CrawlerId = CrawlerId;

    // The factory is disposable, so scope it tightly around the conversion.
    using (var factory = _provider.GetInstanceOf<IModelFactory>())
    {
        AddLinkToCrawl(factory.ConvertToLinkToCrawl(page, SessionId));
    }
}
示例14: Add
/// <summary>
/// Schedules the param to be crawled in a FIFO fashion. When uri recrawling is
/// disabled, a uri that was already scheduled or crawled is silently dropped.
/// </summary>
/// <param name="page">Page to enqueue; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="page"/> is null.</exception>
public void Add(PageToCrawl page)
{
    if (page == null)
        throw new ArgumentNullException("page");

    // TryAdd is the dedup gate: it fails exactly when the uri was seen before.
    if (!_allowUriRecrawling && !_scheduledOrCrawled.TryAdd(page.Uri.AbsoluteUri, null))
        return;

    _pagesToCrawl.Enqueue(page);
}
示例15: ShouldCrawlPage
/// <summary>
/// Decides whether <paramref name="pageToCrawl"/> may be crawled, applying (in
/// order): null checks, redirect-chain length, crawl depth, uri scheme,
/// total-page limit, per-domain limit, and internal/external rules. Retries
/// are exempt from the two page-count limits.
/// </summary>
/// <param name="pageToCrawl">Candidate page; a null value is rejected, not thrown.</param>
/// <param name="crawlContext">Crawl-wide state and configuration; null is rejected.</param>
/// <returns>A CrawlDecision whose Reason explains any rejection.</returns>
public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
if(pageToCrawl == null)
return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };
if (crawlContext == null)
return new CrawlDecision { Allow = false, Reason = "Null crawl context" };
// Only pages that arrived via a redirect carry a RedirectPosition to compare.
if (pageToCrawl.RedirectedFrom != null && pageToCrawl.RedirectPosition > crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects)
return new CrawlDecision { Allow = false, Reason = string.Format("HttpRequestMaxAutoRedirects limit of [{0}] has been reached", crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects) };
if(pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };
// Accepts both http and https (and any other scheme starting with "http").
if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };
//TODO Do we want to ignore redirect chains (ie.. do not treat them as separate page crawls)?
// The limit counts already-crawled pages plus everything still queued in the scheduler.
if (!pageToCrawl.IsRetry &&
crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
crawlContext.CrawledCount + crawlContext.Scheduler.Count + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
{
return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
}
// Per-domain cap only applies once at least one page of the domain was crawled.
int pagesCrawledInThisDomain = 0;
if (!pageToCrawl.IsRetry &&
crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
pagesCrawledInThisDomain > 0)
{
if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
}
if(!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
return new CrawlDecision { Allow = false, Reason = "Link is external" };
return new CrawlDecision { Allow = true };
}