本文整理汇总了 C# 中 HtmlAgilityPack.HtmlDocument.ExtractContent 方法的典型用法代码示例。如果您正苦于以下问题:C# HtmlDocument.ExtractContent 方法的具体用法?C# HtmlDocument.ExtractContent 怎么用?C# HtmlDocument.ExtractContent 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 HtmlAgilityPack.HtmlDocument 的用法示例。
在下文中一共展示了HtmlDocument.ExtractContent方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C#代码示例。
示例1: Start
/// <summary>
/// Crawls articles one at a time in ascending id order and stores each result
/// in <paramref name="database"/>. This method loops forever and never returns;
/// it blocks the calling thread with <see cref="Thread.Sleep(int)"/> between requests.
/// </summary>
/// <remarks>
/// Crawling resumes after the highest <c>ArticleId</c> already stored in the
/// <c>Article</c> table (or from id 1 when the table is empty). A failed crawl is
/// logged and the article is skipped; it is not retried.
/// </remarks>
/// <param name="database">Database used to read the resume point and to save crawled articles.</param>
public void Start(Database database)
{
    // Resume point: the newest article already stored, or 0 when nothing has been stored yet.
    var lastArticle = database.ExecuteReader<Article>("SELECT * FROM Article ORDER BY ArticleId DESC LIMIT 1");
    if (lastArticle.Count == 0)
    {
        _lastCrawledArticleId = 0;
    }
    else
    {
        _lastCrawledArticleId = lastArticle[0].ArticleId;
    }

    // NOTE(review): presumably the id of the newest article currently published on the site — confirm.
    var lastArticleId = GetLastArticleId();

    while (true)
    {
        // Pick the article to crawl next.
        var nextArticleId = _lastCrawledArticleId + 1;

        // Wait until the target article exists before requesting it. The comparison is
        // '>' rather than '==' so that a crawler which starts already caught up (or ahead
        // of the site) waits here instead of requesting non-existent ids forever.
        while (nextArticleId > lastArticleId)
        {
            lastArticleId = GetLastArticleId();
            if (nextArticleId > lastArticleId)
            {
                // No new article yet: rest for 10 minutes before checking again.
                Thread.Sleep(10 * 60 * 1000);
            }
        }

        // Build the page address for the target article.
        var targetUrl = MakeArticleUrl(CategoryId, nextArticleId);
        try
        {
            // Fetch the page using code page 51949 (EUC-KR); 5000 is the timeout (5 seconds).
            var rawHtml = targetUrl.CrawlIt(Encoding.GetEncoding(51949), 5000);

            // Parse the page and extract the fields of interest.
            var htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(rawHtml);
            var article = new Article
            {
                CategoryId = CategoryId,
                ArticleId = nextArticleId,
                RawHtml = rawHtml,
                CrawlingTime = DateTime.Now,
            };
            article.IsDeleted = htmlDoc.IsDeletedArticle();
            if (!article.IsDeleted)
            {
                // Only a non-deleted article has author/time/title/content to extract.
                article.Author = htmlDoc.ExtractAutor();
                article.WriteTime = htmlDoc.ExtractWrittenTime();
                article.Title = htmlDoc.ExtractTitle();
                article.Content = htmlDoc.ExtractContent();
            }

            // Save to the database.
            database.SyncData<Article>(article);
        }
        catch (Exception ex)
        {
            // Best effort: record which URL failed and why, then move on to the next article.
            LogHelper.Log(new Exception(targetUrl));
            LogHelper.Log(ex);
        }

        // Rest for 3 seconds after each article before advancing.
        Thread.Sleep(3 * 1000);
        _lastCrawledArticleId++;
    }
}