本文整理汇总了C#中HtmlAgilityPack.HtmlDocument.GetLinks方法的典型用法代码示例。如果您正苦于以下问题：C# HtmlDocument.GetLinks方法的具体用法？C# HtmlDocument.GetLinks怎么用？C# HtmlDocument.GetLinks使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类HtmlAgilityPack.HtmlDocument的用法示例。
在下文中一共展示了HtmlDocument.GetLinks方法的3个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的C#代码示例。
示例1: Process
/// <summary>
/// Parses the HTML response carried by <paramref name="propertyBag"/>, stores the parsed
/// document, title, meta data and extracted text on the property bag, and then hands every
/// link found in the document to <paramref name="crawler"/> as a step one level deeper.
/// Responses that are not HTTP 200 or are not HTML content are ignored.
/// </summary>
/// <param name="crawler">Crawler that receives the follow-up steps through AddStep.</param>
/// <param name="propertyBag">Response data of the current step; read and updated by this method.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
// Both arguments go through the AspectF NotNull checks before any work is done.
AspectF.Define.
NotNull(crawler, "crawler").
NotNull(propertyBag, "propertyBag");
// Only successful responses are processed.
if (propertyBag.StatusCode != HttpStatusCode.OK)
{
return;
}
// Only content types accepted by IsHtmlContent are processed.
if (!IsHtmlContent(propertyBag.ContentType))
{
return;
}
HtmlDocument htmlDoc = new HtmlDocument
{
OptionAddDebuggingAttributes = false,
OptionAutoCloseOnEnd = true,
OptionFixNestedTags = true,
OptionReadEncoding = true
};
using (Stream reader = propertyBag.GetResponse())
{
// DetectEncoding consumes the stream, so it is rewound to the start before loading.
Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
reader.Seek(0, SeekOrigin.Begin);
if (!documentEncoding.IsNull())
{
htmlDoc.Load(reader, documentEncoding, true);
}
else
{
htmlDoc.Load(reader, true);
}
}
// Keep the markup as parsed; it is the input for both the text rules and the link rules below.
string originalContent = htmlDoc.DocumentNode.OuterHtml;
if (HasTextStripRules || HasSubstitutionRules)
{
// Re-parse the document from the stripped and substituted markup.
string content = StripText(originalContent);
content = Substitute(content, propertyBag.Step);
using (TextReader tr = new StringReader(content))
{
htmlDoc.Load(tr);
}
}
propertyBag["HtmlDoc"].Value = htmlDoc;
HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
// Extract Title
// The inner text of all matching title nodes is joined with ';'.
if (!nodes.IsNull())
{
propertyBag.Title = string.Join(";", nodes.
Select(n => n.InnerText).
ToArray()).Trim();
}
// Extract Meta Data
// Each meta element with non-empty name and content becomes one "name: content" string.
nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
if (!nodes.IsNull())
{
propertyBag["Meta"].Value = (
from entry in nodes
let name = entry.Attributes["name"]
let content = entry.Attributes["content"]
where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
select name.Value + ": " + content.Value).ToArray();
}
propertyBag.Text = htmlDoc.ExtractText().Trim();
if (HasLinkStripRules || HasTextStripRules)
{
// Link extraction restarts from originalContent, i.e. the markup before StripText/Substitute.
string content = StripLinks(originalContent);
using (TextReader tr = new StringReader(content))
{
htmlDoc.Load(tr);
}
}
// Extract Links
// Links and References are merged with Union, so duplicates are visited once.
DocumentWithLinks links = htmlDoc.GetLinks();
foreach (string link in links.Links.Union(links.References))
{
if (link.IsNullOrEmpty())
{
continue;
}
// NOTE(review): baseUrl does not depend on the loop variable and is recomputed per link.
string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
string normalizedLink = NormalizeLink(baseUrl, decodedLink);
// Links for which NormalizeLink yields nothing are skipped.
if (normalizedLink.IsNullOrEmpty())
{
continue;
}
// Queue the link one level deeper than the current step, with the current step as its parent.
crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
propertyBag.Step, new Dictionary<string, object>
//......... part of the code is omitted here .........
示例2: Process
/// <summary>
/// Processes a crawled step against the crawl database: looks up the step's crawl-history
/// record, removes the queue entry when the step has no history, did not return HTTP 200 or
/// is not HTML, and otherwise parses the response and loads the cached site configuration.
/// The remainder of the method is not shown in this excerpt.
/// </summary>
/// <param name="crawler">Crawler driving the current crawl.</param>
/// <param name="propertyBag">Response data of the current step.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
// Both arguments go through the AspectF NotNull checks before any work is done.
AspectF.Define.
NotNull(crawler, "crawler").
NotNull(propertyBag, "propertyBag");
// The unescaped absolute URI of the step is the lookup key, capped at 396 characters.
// NOTE(review): 396 presumably matches the maximum length of the key column — confirm.
string stepUri = Uri.UnescapeDataString(propertyBag.Step.Uri.AbsoluteUri);
if (stepUri.Length > 396)
{
stepUri = stepUri.Substring(0, 396);
}
var crawlHistory = AspectF.Define.
Return<CrawlHistory, NCrawlerEntitiesDbServices>(
e => e.CrawlHistory.Where(m => m.Key == stepUri).FirstOrDefault());
// No history record for this key: delete the matching queue row and stop.
if (crawlHistory == null)
{
AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
{
// The key is passed as a command argument, not concatenated into the SQL text.
e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", stepUri);
});
return;
}
try
{
// Unsuccessful responses are removed from the queue and not processed further.
if (propertyBag.StatusCode != HttpStatusCode.OK)
{
AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
{
e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
//CrawlQueue result = e.CrawlQueue.FirstOrDefault(q => q.Key == crawlHistory.Key);
//if (!result.IsNull())
//{
// e.DeleteObject(result);
// e.SaveChanges();
//}
});
return;
}
// Non-HTML responses are removed from the queue and not processed further.
if (!IsHtmlContent(propertyBag.ContentType))
{
AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
{
e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
//CrawlQueue result = e.CrawlQueue.FirstOrDefault(q => q.Key == crawlHistory.Key);
//if (!result.IsNull())
//{
// e.DeleteObject(result);
// e.SaveChanges();
//}
});
return;
}
HtmlDocument htmlDoc = new HtmlDocument
{
OptionAddDebuggingAttributes = false,
OptionAutoCloseOnEnd = true,
OptionFixNestedTags = true,
OptionReadEncoding = true
};
using (Stream reader = propertyBag.GetResponse())
{
// DetectEncoding consumes the stream, so it is rewound to the start before loading.
Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
reader.Seek(0, SeekOrigin.Begin);
if (!documentEncoding.IsNull())
{
htmlDoc.Load(reader, documentEncoding, true);
}
else
{
htmlDoc.Load(reader, true);
}
//string content = reader.ReadToEnd();
//resultHtmlContent = content;
}
//string steplUri = propertyBag.ResponseUri.OriginalString;
// Capture the parsed markup, the base URL of the response and the document's links.
string orginalHtmlContent = htmlDoc.DocumentNode.OuterHtml;
string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
DocumentWithLinks links = htmlDoc.GetLinks();
//string urlRegex = @"^http://www.bbc.co.uk/food/recipes/[^#/]+$";
List<string> recipeRegex = null;
// The text of OriginalWebSite.txt is looked up in the cache first.
// NOTE(review): the variable name suggests the content is JSON — confirm against the file.
var jsonStr = cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite") as string;
if (jsonStr == null)
{
// Cache miss: read the file as UTF-8 and cache its text with a one-day absolute expiration.
using (var stream = new StreamReader(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite.txt", Encoding.UTF8))
{
jsonStr = stream.ReadToEnd();
var policy = new CacheItemPolicy();
policy.Priority = CacheItemPriority.NotRemovable;
policy.AbsoluteExpiration = DateTimeOffset.Now.AddDays(1);
cache.Set(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite", jsonStr, policy);
//......... part of the code is omitted here .........
示例3: Process
//......... part of the code is omitted here .........
// Excerpt: the method signature and the opening of the body are not shown. The visible part
// loads the HTML document, fills the property bag, resolves the base URL and crawls each link.
// Load with the detected encoding when there is one; otherwise let the loader detect it.
if (!documentEncoding.IsNull())
{
htmlDoc.Load(ms, documentEncoding, true);
}
else
{
htmlDoc.Load(ms, true);
}
}
// Keep the markup as parsed; it is the input for both the text rules and the link rules below.
string originalContent = htmlDoc.DocumentNode.OuterHtml;
if (HasTextStripRules || HasSubstitutionRules)
{
// Re-parse the document from the stripped and substituted markup.
string content = StripText(originalContent);
content = Substitute(content, propertyBag.Step);
using (TextReader tr = new StringReader(content))
{
htmlDoc.Load(tr);
}
}
propertyBag["HtmlDoc"].Value = htmlDoc;
HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
// Extract Title
// The inner text of all matching title nodes is joined with ';'.
if (!nodes.IsNull())
{
propertyBag.Title = string.Join(";", nodes.
Select(n => n.InnerText).
ToArray()).Trim();
}
// Extract Meta Data
// Each meta element with non-empty name and content becomes one "name: content" string.
nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
if (!nodes.IsNull())
{
propertyBag["Meta"].Value = (
from entry in nodes
let name = entry.Attributes["name"]
let content = entry.Attributes["content"]
where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
select $"{name.Value}: {content.Value}").ToArray();
}
// Extract text
propertyBag.Text = htmlDoc.ExtractText().Trim();
if (HasLinkStripRules || HasTextStripRules)
{
// Link extraction restarts from originalContent, i.e. the markup before StripText/Substitute.
string content = StripLinks(originalContent);
using (TextReader tr = new StringReader(content))
{
htmlDoc.Load(tr);
}
}
// Default base URL: the response URI up to and including its path.
string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
// Extract Head Base
// A well-formed href on a <base> element inside <head> replaces the default base URL.
nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
if (!nodes.IsNull())
{
// Relative hrefs are prefixed with the scheme and server of the response URI.
// NOTE(review): AddToEnd presumably appends the default baseUrl as the last candidate so
// FirstOrDefault falls back to it when no href qualifies — confirm against the helper.
baseUrl = nodes
.Select(entry => new {entry, href = entry.Attributes["href"]})
.Where(arg => !arg.href.IsNull()
&& !arg.href.Value.IsNullOrEmpty()
&& Uri.IsWellFormedUriString(arg.href.Value, UriKind.RelativeOrAbsolute))
.Select(t =>
{
if (Uri.IsWellFormedUriString(t.href.Value, UriKind.Relative))
{
return propertyBag.ResponseUri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped) + t.href.Value;
}
return t.href.Value;
})
.AddToEnd(baseUrl)
.FirstOrDefault();
}
// Extract Links
// Links and References are merged with Union, so duplicates are visited once.
DocumentWithLinks links = htmlDoc.GetLinks();
foreach (string link in links.Links.Union(links.References))
{
if (link.IsNullOrEmpty())
{
continue;
}
string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
string normalizedLink = NormalizeLink(baseUrl, decodedLink);
// Links for which NormalizeLink yields nothing are skipped.
if (normalizedLink.IsNullOrEmpty())
{
continue;
}
crawler.Crawl(new Uri(normalizedLink), propertyBag);
}
// The work above ran synchronously; an already-completed task carrying true is returned.
return Task.FromResult(true);
}