C# Poco.PageToCrawl Class Code Examples

This article collects and summarizes typical usage examples of the Abot.Poco.PageToCrawl class in C#. If you have been wondering what exactly the PageToCrawl class is used for, how to use it, or what real-world usage looks like, the hand-picked examples below may help.


The PageToCrawl class belongs to the Abot.Poco namespace. A total of 15 code examples of the class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better C# code samples.
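
As a quick orientation before the examples, here is a minimal sketch of building a PageToCrawl by hand, assuming the Abot 1.x API used by the snippets below; the URL and property values are purely illustrative:

    using System;
    using Abot.Poco;

    class PageToCrawlDemo
    {
        static void Main()
        {
            // The Uri-taking constructor also initializes the dynamic PageBag
            // (compare Examples 5 and 7 below).
            var page = new PageToCrawl(new Uri("http://a.com/section/index.html"));
            page.ParentUri = new Uri("http://a.com/");
            page.CrawlDepth = 1;     // one link away from the crawl root
            page.IsInternal = true;  // same base domain as the root
            page.IsRoot = false;

            Console.WriteLine(page.Uri.AbsoluteUri); // http://a.com/section/index.html
        }
    }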

Example 1: ShouldCrawlPage

        public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if(pageToCrawl == null)
                return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

            if (crawlContext == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

            if(pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
                return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

            if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
                return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

            if (crawlContext.CrawledCount + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
            {
                return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
            }

            int pagesCrawledInThisDomain = 0;
            if (crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
                crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
                pagesCrawledInThisDomain > 0)
            {
                if(pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
                    return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
            }

            if(!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
                return new CrawlDecision { Allow = false, Reason = "Link is external" };

            return new CrawlDecision { Allow = true };
        }
Developer: vinchu, Project: abot, Lines: 33, Source: CrawlDecisionMaker.cs
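
A hedged usage sketch for the decision maker above: CrawlDecisionMaker is Abot's default ICrawlDecisionMaker implementation, and the configuration values here are illustrative. The page is deliberately deeper than MaxCrawlDepth, so the decision should come back denied:

    using System;
    using Abot.Core;
    using Abot.Poco;

    class DecisionDemo
    {
        static void Main()
        {
            var context = new CrawlContext
            {
                CrawlConfiguration = new CrawlConfiguration
                {
                    MaxCrawlDepth = 2,
                    MaxPagesToCrawl = 100
                }
            };

            var page = new PageToCrawl(new Uri("http://a.com/too/deep/page.html"))
            {
                CrawlDepth = 5,     // exceeds MaxCrawlDepth above
                IsInternal = true
            };

            CrawlDecision decision = new CrawlDecisionMaker().ShouldCrawlPage(page, context);
            Console.WriteLine("{0} - {1}", decision.Allow, decision.Reason);
            // Expected: False - Crawl depth is above max
        }
    }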

Example 2: PageCrawlStartingArgs

        public PageCrawlStartingArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl)
            : base(crawlContext)
        {
            if (pageToCrawl == null)
                throw new ArgumentNullException("pageToCrawl");

            PageToCrawl = pageToCrawl;
        }
Developer: CocoaLab, Project: abot, Lines: 8, Source: PageCrawlStartingArgs.cs
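
These event args are what a subscriber receives just before a page is fetched. A sketch, assuming Abot's PoliteWebCrawler and its synchronous PageCrawlStarting event (the seed URL is arbitrary):

    using System;
    using Abot.Crawler;
    using Abot.Poco;

    class StartingEventDemo
    {
        static void Main()
        {
            var crawler = new PoliteWebCrawler();

            crawler.PageCrawlStarting += (sender, e) =>
            {
                PageToCrawl page = e.PageToCrawl;
                Console.WriteLine("About to crawl {0} (depth {1})",
                    page.Uri.AbsoluteUri, page.CrawlDepth);
            };

            crawler.Crawl(new Uri("http://a.com/"));
        }
    }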

Example 3: PageCrawlDisallowedArgs

        public PageCrawlDisallowedArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl, string disallowedReason)
            : base(crawlContext, pageToCrawl)
        {
            if (string.IsNullOrWhiteSpace(disallowedReason))
                throw new ArgumentNullException("disallowedReason");

            DisallowedReason = disallowedReason;
        }
Developer: haigneyc, Project: abot, Lines: 8, Source: PageCrawlDisallowedArgs.cs
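
The DisallowedReason carried by these args surfaces the Reason string produced by decisions like those in Examples 1 and 15. A sketch of listening for disallowed pages, again assuming PoliteWebCrawler and its synchronous PageCrawlDisallowed event:

    using System;
    using Abot.Crawler;

    class DisallowedEventDemo
    {
        static void Main()
        {
            var crawler = new PoliteWebCrawler();

            crawler.PageCrawlDisallowed += (sender, e) =>
            {
                // e.DisallowedReason is the string validated in the constructor above.
                Console.WriteLine("Skipped {0}: {1}",
                    e.PageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
            };

            crawler.Crawl(new Uri("http://a.com/"));
        }
    }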

Example 4: SetUp

        public void SetUp()
        {
            _page = new PageToCrawl { Uri = new Uri("http://a.com/") };
            _pages = new List<PageToCrawl> { new PageToCrawl { Uri = new Uri("http://a.com/") }, new PageToCrawl { Uri = new Uri("http://b.com/") } };
            _fakeCrawledUrlRepo = new Mock<ICrawledUrlRepository>();
            _fakePagesToCrawlRepo = new Mock<IPagesToCrawlRepository>();

            _unitUnderTest = new Scheduler(false, _fakeCrawledUrlRepo.Object, _fakePagesToCrawlRepo.Object);
        }
Developer: CocoaLab, Project: abot, Lines: 9, Source: SchedulerTest.cs

Example 5: Constructor_ValidUri_CreatesInstance

 public void Constructor_ValidUri_CreatesInstance()
 {
     PageToCrawl unitUnderTest = new PageToCrawl(new Uri("http://a.com/"));
     Assert.AreEqual(false, unitUnderTest.IsRetry);
     Assert.AreEqual(false, unitUnderTest.IsRoot);
     Assert.AreEqual(false, unitUnderTest.IsInternal);
     Assert.AreEqual(null, unitUnderTest.ParentUri);
     Assert.AreEqual("http://a.com/", unitUnderTest.Uri.AbsoluteUri);
     Assert.AreEqual(0, unitUnderTest.CrawlDepth);
 }
Developer: justinverhoef, Project: abot, Lines: 10, Source: PageToCrawlTest.cs

Example 6: ConvertToPageToCrawl

 public virtual PageToCrawl ConvertToPageToCrawl(LinkToCrawl link, int crawlerId)
 {
     var page = new PageToCrawl(new Uri(link.TargetUrl));
     page.PageBag.SessionId = link.SessionId;
     page.PageBag.CrawlerId = crawlerId;
     page.ParentUri = new Uri(link.SourceUrl);
     page.CrawlDepth = link.CrawlDepth;
     page.IsInternal = link.IsInternal;
     page.IsRoot = link.IsRoot;
     return page;
 }
Developer: BgRva, Project: ThrongBot, Lines: 11, Source: ModelFactory.cs

Example 7: Constructor_CreatesInstance

 public void Constructor_CreatesInstance()
 {
     PageToCrawl unitUnderTest = new PageToCrawl();
     Assert.AreEqual(false, unitUnderTest.IsRetry);
     Assert.AreEqual(false, unitUnderTest.IsRoot);
     Assert.AreEqual(false, unitUnderTest.IsInternal);
     Assert.AreEqual(null, unitUnderTest.ParentUri);
     Assert.IsNull(unitUnderTest.Uri);
     Assert.AreEqual(0, unitUnderTest.CrawlDepth);
     Assert.IsNull(unitUnderTest.PageBag);
 }
Developer: haigneyc, Project: abot, Lines: 11, Source: PageToCrawlTest.cs

Example 8: ConvertToLinkToCrawl

 public virtual LinkToCrawl ConvertToLinkToCrawl(PageToCrawl page, int sessionId)
 {
     var link = new LinkToCrawl();
     link.SessionId = sessionId;
     link.SourceUrl = page.ParentUri.AbsoluteUri;
     link.TargetUrl = page.Uri.AbsoluteUri;
     link.TargetBaseDomain = page.Uri.GetBaseDomain();
     link.CrawlDepth = page.CrawlDepth;
     link.IsRoot = page.IsRoot;
     link.IsInternal = page.IsInternal;
     return link;
 }
Developer: BgRva, Project: ThrongBot, Lines: 12, Source: ModelFactory.cs
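
Examples 6 and 8 are inverse conversions from the ThrongBot project. The round-trip sketch below is self-contained: this LinkToCrawl is a hypothetical stand-in reduced to the fields the two converters touch, and Uri.Host stands in for the project's GetBaseDomain extension; the PageBag fields are omitted for brevity.

    using System;
    using Abot.Poco;

    // Hypothetical stand-in for ThrongBot's LinkToCrawl entity.
    class LinkToCrawl
    {
        public int SessionId { get; set; }
        public string SourceUrl { get; set; }
        public string TargetUrl { get; set; }
        public string TargetBaseDomain { get; set; }
        public int CrawlDepth { get; set; }
        public bool IsRoot { get; set; }
        public bool IsInternal { get; set; }
    }

    class RoundTripDemo
    {
        static void Main()
        {
            var page = new PageToCrawl(new Uri("http://a.com/page.html"))
            {
                ParentUri = new Uri("http://a.com/"),
                CrawlDepth = 1,
                IsInternal = true
            };

            // PageToCrawl -> LinkToCrawl, mirroring Example 8.
            var link = new LinkToCrawl
            {
                SessionId = 42,
                SourceUrl = page.ParentUri.AbsoluteUri,
                TargetUrl = page.Uri.AbsoluteUri,
                TargetBaseDomain = page.Uri.Host,
                CrawlDepth = page.CrawlDepth,
                IsRoot = page.IsRoot,
                IsInternal = page.IsInternal
            };

            // LinkToCrawl -> PageToCrawl, mirroring Example 6.
            var back = new PageToCrawl(new Uri(link.TargetUrl))
            {
                ParentUri = new Uri(link.SourceUrl),
                CrawlDepth = link.CrawlDepth,
                IsInternal = link.IsInternal,
                IsRoot = link.IsRoot
            };

            Console.WriteLine(back.Uri == page.Uri); // True
        }
    }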

Example 9: Add

		public static void Add(SchedulerState state, PageToCrawl page)
		{
			var json = JsonConvert.SerializeObject(page);
			var url = page.Uri.AbsoluteUri;
			var trans = CreateTransaction(state);
			var crawledPageKey = CrawledPageKey(state.SiteName, url);
			var pageToCrawlKey = PageToCrawlKey(state.SiteName);
			trans.AddCondition(Condition.KeyNotExists(crawledPageKey));
			trans.StringSetAsync(crawledPageKey, "");
			trans.ListLeftPushAsync(pageToCrawlKey, json);
			trans.ExecuteAsync().Wait();
		}
Developer: mng-au, Project: Abot.Redis.Scheduler, Lines: 12, Source: SchedulerFunc.cs
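
The Redis scheduler above persists each page as JSON. Below is a minimal serialization round trip with Json.NET, no Redis required (note that PageBag is a dynamic property, so depending on the Abot and Json.NET versions it may round-trip as a JObject rather than an ExpandoObject):

    using System;
    using Abot.Poco;
    using Newtonsoft.Json;

    class SerializationDemo
    {
        static void Main()
        {
            var page = new PageToCrawl(new Uri("http://a.com/"))
            {
                CrawlDepth = 1,
                IsInternal = true
            };

            string json = JsonConvert.SerializeObject(page);
            var restored = JsonConvert.DeserializeObject<PageToCrawl>(json);

            Console.WriteLine(restored.Uri.AbsoluteUri); // http://a.com/
            Console.WriteLine(restored.CrawlDepth);      // 1
        }
    }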

Example 10: PageBag

        public void PageBag()
        {
            PageToCrawl unitUnderTest = new PageToCrawl(new Uri("http://a.com/"));
            unitUnderTest.PageBag.SomeVal = "someval";
            unitUnderTest.PageBag.SomeQueue = new Queue<string>();
            unitUnderTest.PageBag.SomeQueue.Enqueue("aaa");
            unitUnderTest.PageBag.SomeQueue.Enqueue("bbb");

            Assert.IsNotNull(unitUnderTest.PageBag);
            Assert.AreEqual("someval", unitUnderTest.PageBag.SomeVal);
            Assert.AreEqual("aaa", unitUnderTest.PageBag.SomeQueue.Dequeue());
            Assert.AreEqual("bbb", unitUnderTest.PageBag.SomeQueue.Dequeue());
        }
Developer: CocoaLab, Project: abot, Lines: 13, Source: PageToCrawlTest.cs
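
PageBag is a dynamic property bag for attaching arbitrary per-page state. The sketch below assumes, as in Abot 1.x, that the crawler copies the bag from the PageToCrawl onto the resulting CrawledPage (which subclasses PageToCrawl), so values set before the fetch are readable after it; treat that propagation as an assumption for your version:

    using System;
    using Abot.Crawler;

    class PageBagDemo
    {
        static void Main()
        {
            var crawler = new PoliteWebCrawler();

            // Attach arbitrary state before the page is fetched...
            crawler.PageCrawlStarting += (s, e) =>
            {
                e.PageToCrawl.PageBag.QueuedAt = DateTime.UtcNow;
            };

            // ...and read it back once the page completes.
            crawler.PageCrawlCompleted += (s, e) =>
            {
                Console.WriteLine("{0} queued at {1:O}",
                    e.CrawledPage.Uri.AbsoluteUri, e.CrawledPage.PageBag.QueuedAt);
            };

            crawler.Crawl(new Uri("http://a.com/"));
        }
    }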

Example 11: Add

        public void Add(PageToCrawl page)
        {
            if (page == null)
                throw new ArgumentNullException("page");

            if (_allowUriRecrawling || page.IsRetry)
            {
                _pagesToCrawlRepo.Add(page);
            }
            else
            {
                if (_crawledUrlRepo.AddIfNew(page.Uri))
                    _pagesToCrawlRepo.Add(page);
            }
        }
Developer: CocoaLab, Project: abot, Lines: 15, Source: Scheduler.cs
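
Because AddIfNew returns false for a URI that has already been scheduled or crawled, a second Add of the same URI is silently ignored unless recrawling is enabled or the page is a retry. A sketch, assuming Abot 1.x, where Scheduler's parameterless constructor wires up in-memory repositories (treat that constructor as an assumption for your version):

    using System;
    using Abot.Core;
    using Abot.Poco;

    class SchedulerDemo
    {
        static void Main()
        {
            var scheduler = new Scheduler();

            scheduler.Add(new PageToCrawl(new Uri("http://a.com/")));
            scheduler.Add(new PageToCrawl(new Uri("http://a.com/"))); // duplicate URI, skipped

            Console.WriteLine(scheduler.Count);         // 1
            Console.WriteLine(scheduler.GetNext().Uri); // http://a.com/
        }
    }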

Example 12: ShouldCrawlPage

        protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
        {
            bool allowedByRobots = true;
            if (_robotsDotText != null)
                allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);


            //https://github.com/sjdirect/abot/issues/96 Handle scenario where the root is allowed but all the paths below are disallowed like "disallow: /*"
            var allPathsBelowRootAllowedByRobots = false;
            if (_robotsDotText != null && pageToCrawl.IsRoot && allowedByRobots)
            {
                var anyPathOffRoot = pageToCrawl.Uri.AbsoluteUri.EndsWith("/") ? pageToCrawl.Uri.AbsoluteUri + "aaaaa": pageToCrawl.Uri.AbsoluteUri + "/aaaaa";
                allPathsBelowRootAllowedByRobots = _robotsDotText.IsUrlAllowed(anyPathOffRoot, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            }

            if (_crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled && pageToCrawl.IsRoot)    
            {
                if (!allowedByRobots)
                {
                    string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
                    _logger.DebugFormat(message);
                    allowedByRobots = true;
                    _robotsDotText = null;
                }
                else if (!allPathsBelowRootAllowedByRobots)
                {
                    string message = string.Format("All Pages below [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
                    _logger.DebugFormat(message);
                    allowedByRobots = true;
                    _robotsDotText = null;
                }

            }
            else if (!allowedByRobots)
            {
                string message = string.Format("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
                _logger.DebugFormat(message);

                FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
                FirePageCrawlDisallowedEvent(pageToCrawl, message);

                return false;
            }

            return allowedByRobots && base.ShouldCrawlPage(pageToCrawl);
        }
Developer: sharpcoder7, Project: abot, Lines: 46, Source: PoliteWebCrawler.cs
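
Whether this robots.txt logic runs at all is driven by configuration. Below is a sketch of the relevant CrawlConfiguration knobs, with property names as in Abot 1.x and illustrative values; the nine-argument PoliteWebCrawler constructor shown here treats nulls as "use the default implementation" (treat the exact constructor arity as an assumption for your version):

    using System;
    using Abot.Crawler;
    using Abot.Poco;

    class RobotsConfigDemo
    {
        static void Main()
        {
            var config = new CrawlConfiguration
            {
                IsRespectRobotsDotTextEnabled = true,                // fetch and honor robots.txt
                RobotsDotTextUserAgentString = "abot",               // UA matched against robots.txt rules
                IsIgnoreRobotsDotTextIfRootDisallowedEnabled = true, // the fallback shown in Example 12
                MaxCrawlDepth = 2
            };

            var crawler = new PoliteWebCrawler(config, null, null, null, null, null, null, null, null);
            crawler.Crawl(new Uri("http://a.com/"));
        }
    }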

Example 13: Add

        /// <summary>
        /// If this method is called, then it assumes some pre-logic for links to avoid has already
        /// been applied and that the <paramref name="page"/> should be stored for future crawling.
        /// </summary>
        /// <param name="page"></param>
        public void Add(PageToCrawl page)
        {
            if (page == null)
                throw new ArgumentNullException("page");

            //_logger.DebugFormat("Add(page): Target: {0}, Source: {1}, Root: {2}",
            //    page.Uri.AbsoluteUri,
            //    page.ParentUri.AbsoluteUri,
            //    page.IsRoot);

            page.PageBag.SessionId = SessionId;
            page.PageBag.CrawlerId = CrawlerId;
            using (var factory = _provider.GetInstanceOf<IModelFactory>())
            {
                var link = factory.ConvertToLinkToCrawl(page, SessionId);
                AddLinkToCrawl(link);
            }
        }
Developer: BgRva, Project: ThrongBot, Lines: 23, Source: MyScheduler.cs

Example 14: Add

        /// <summary>
        /// Schedules the param to be crawled in a FIFO fashion
        /// </summary>
        public void Add(PageToCrawl page)
        {
            if (page == null)
                throw new ArgumentNullException("page");

            if (_allowUriRecrawling)
            {
                //_logger.DebugFormat("Scheduling for crawl [{0}]", page.Uri.AbsoluteUri);
                _pagesToCrawl.Enqueue(page);
            }
            else
            {
                if (_scheduledOrCrawled.TryAdd(page.Uri.AbsoluteUri, null))
                {
                    //_logger.DebugFormat("Scheduling for crawl [{0}]", page.Uri.AbsoluteUri);
                    _pagesToCrawl.Enqueue(page);
                }
            }
        }
Developer: haigneyc, Project: abot, Lines: 22, Source: FifoScheduler.cs

Example 15: ShouldCrawlPage

        public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if(pageToCrawl == null)
                return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

            if (crawlContext == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

            if (pageToCrawl.RedirectedFrom != null && pageToCrawl.RedirectPosition > crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects)
                return new CrawlDecision { Allow = false, Reason = string.Format("HttpRequestMaxAutoRedirects limit of [{0}] has been reached", crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects) };

            if(pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
                return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

            if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
                return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

            //TODO Do we want to ignore redirect chains (i.e., not treat them as separate page crawls)?
            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
                crawlContext.CrawledCount + crawlContext.Scheduler.Count + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
            {
                return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
            }

            int pagesCrawledInThisDomain = 0;
            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
                crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
                pagesCrawledInThisDomain > 0)
            {
                if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
                    return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
            }

            if(!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
                return new CrawlDecision { Allow = false, Reason = "Link is external" };

            return new CrawlDecision { Allow = true };
        }
Developer: sharpcoder7, Project: abot, Lines: 40, Source: CrawlDecisionMaker.cs


Note: The Abot.Poco.PageToCrawl class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by various developers; copyright in the source code remains with the original authors. For distribution and use, please refer to the License of the corresponding project. Do not repost without permission.