本文整理汇总了 C# 中 HtmlAgilityPack.HtmlDocument.ExtractText 方法的典型用法代码示例。如果您正苦于以下问题：C# HtmlDocument.ExtractText 方法的具体用法？C# HtmlDocument.ExtractText 怎么用？C# HtmlDocument.ExtractText 使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 HtmlAgilityPack.HtmlDocument 的用法示例。
下文共展示了 HtmlDocument.ExtractText 方法的 2 个代码示例，这些示例默认按受欢迎程度排序。您可以为喜欢或觉得有用的代码点赞，您的评价将有助于系统推荐出更好的 C# 代码示例。
示例1: Process
/// <summary>
/// Parses an HTML response held by <paramref name="propertyBag"/>, records its
/// title, meta data and extracted text on the bag, and queues every link found
/// in the page on <paramref name="crawler"/> as a follow-up step.
/// </summary>
/// <param name="crawler">Crawler that receives the discovered links via <c>AddStep</c>.</param>
/// <param name="propertyBag">Response data for the current step; updated in place.</param>
/// <remarks>
/// NOTE(review): this excerpt is truncated by the page it was taken from; the
/// tail of the link loop is not visible here.
/// </remarks>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
// Argument guards; presumably these throw when an argument is null - confirm against AspectF.
AspectF.Define.
NotNull(crawler, "crawler").
NotNull(propertyBag, "propertyBag");
// Only successful (200 OK) responses are processed.
if (propertyBag.StatusCode != HttpStatusCode.OK)
{
return;
}
// Anything that IsHtmlContent does not accept is ignored.
if (!IsHtmlContent(propertyBag.ContentType))
{
return;
}
HtmlDocument htmlDoc = new HtmlDocument
{
OptionAddDebuggingAttributes = false,
OptionAutoCloseOnEnd = true,
OptionFixNestedTags = true,
OptionReadEncoding = true
};
using (Stream reader = propertyBag.GetResponse())
{
// DetectEncoding reads from the stream, so rewind before the real load.
// NOTE(review): Seek requires a seekable stream - confirm GetResponse() always returns one.
Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
reader.Seek(0, SeekOrigin.Begin);
if (!documentEncoding.IsNull())
{
htmlDoc.Load(reader, documentEncoding, true);
}
else
{
htmlDoc.Load(reader, true);
}
}
// Snapshot of the parsed markup; reused below as the input for link stripping.
string originalContent = htmlDoc.DocumentNode.OuterHtml;
if (HasTextStripRules || HasSubstitutionRules)
{
// Apply text-strip and substitution rules, then re-parse the modified markup
// into the same document instance.
string content = StripText(originalContent);
content = Substitute(content, propertyBag.Step);
using (TextReader tr = new StringReader(content))
{
htmlDoc.Load(tr);
}
}
// NOTE(review): the bag stores this same htmlDoc instance, which is loaded again
// further down when link/text strip rules exist - confirm consumers expect that.
propertyBag["HtmlDoc"].Value = htmlDoc;
HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
// Extract Title: inner text of all <title> nodes joined with ';' and trimmed.
if (!nodes.IsNull())
{
propertyBag.Title = string.Join(";", nodes.
Select(n => n.InnerText).
ToArray()).Trim();
}
// Extract Meta Data: "name: content" strings for every <meta> element whose
// name and content attributes are both present and non-empty.
nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
if (!nodes.IsNull())
{
propertyBag["Meta"].Value = (
from entry in nodes
let name = entry.Attributes["name"]
let content = entry.Attributes["content"]
where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
select name.Value + ": " + content.Value).ToArray();
}
// Extract text from the (possibly stripped/substituted) document.
propertyBag.Text = htmlDoc.ExtractText().Trim();
if (HasLinkStripRules || HasTextStripRules)
{
// Link stripping starts from originalContent, not from the text-stripped
// markup loaded above, and replaces what htmlDoc currently holds.
string content = StripLinks(originalContent);
using (TextReader tr = new StringReader(content))
{
htmlDoc.Load(tr);
}
}
// Extract Links
DocumentWithLinks links = htmlDoc.GetLinks();
foreach (string link in links.Links.Union(links.References))
{
if (link.IsNullOrEmpty())
{
continue;
}
// The base URL does not depend on the current link; it is recomputed on every iteration.
string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
string normalizedLink = NormalizeLink(baseUrl, decodedLink);
if (normalizedLink.IsNullOrEmpty())
{
continue;
}
// Queue the link one level deeper than the current step.
crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
propertyBag.Step, new Dictionary<string, object>
//......... part of the code is omitted here .........
示例2: Process
/// <summary>
/// Parses an HTML response held by <paramref name="propertyBag"/>, records its
/// title, meta data and extracted text on the bag, and then resolves the base URL
/// used for the page's links (honouring a <c>&lt;base href&gt;</c> in the head).
/// </summary>
/// <param name="crawler">Crawler instance; its use lies in the part of the method not shown here.</param>
/// <param name="propertyBag">Response data for the current step; updated in place.</param>
/// <returns>
/// A completed task with result <c>true</c> when the response is skipped (non-OK
/// status or non-HTML content). NOTE(review): the return for the normal path is in
/// the truncated remainder of this excerpt.
/// </returns>
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
// Argument guards; presumably these throw when an argument is null - confirm against AspectF.
AspectF.Define
.NotNull(crawler, nameof(crawler))
.NotNull(propertyBag, nameof(propertyBag));
// Only successful (200 OK) responses are processed.
if (propertyBag.StatusCode != HttpStatusCode.OK)
{
return Task.FromResult(true);
}
// Anything that IsHtmlContent does not accept is ignored.
if (!IsHtmlContent(propertyBag.ContentType))
{
return Task.FromResult(true);
}
HtmlDocument htmlDoc = new HtmlDocument
{
OptionAddDebuggingAttributes = false,
OptionAutoCloseOnEnd = true,
OptionFixNestedTags = true,
OptionReadEncoding = true
};
// The response is read through an in-memory stream over propertyBag.Response.
using (MemoryStream ms = new MemoryStream(propertyBag.Response))
{
// DetectEncoding reads from the stream, so rewind before the real load.
Encoding documentEncoding = htmlDoc.DetectEncoding(ms);
ms.Seek(0, SeekOrigin.Begin);
if (!documentEncoding.IsNull())
{
htmlDoc.Load(ms, documentEncoding, true);
}
else
{
htmlDoc.Load(ms, true);
}
}
// Snapshot of the parsed markup; reused below as the input for link stripping.
string originalContent = htmlDoc.DocumentNode.OuterHtml;
if (HasTextStripRules || HasSubstitutionRules)
{
// Apply text-strip and substitution rules, then re-parse the modified markup
// into the same document instance.
string content = StripText(originalContent);
content = Substitute(content, propertyBag.Step);
using (TextReader tr = new StringReader(content))
{
htmlDoc.Load(tr);
}
}
// NOTE(review): the bag stores this same htmlDoc instance, which is loaded again
// further down when link/text strip rules exist - confirm consumers expect that.
propertyBag["HtmlDoc"].Value = htmlDoc;
HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
// Extract Title: inner text of all <title> nodes joined with ';' and trimmed.
if (!nodes.IsNull())
{
propertyBag.Title = string.Join(";", nodes.
Select(n => n.InnerText).
ToArray()).Trim();
}
// Extract Meta Data: "name: content" strings for every <meta> element whose
// name and content attributes are both present and non-empty.
nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
if (!nodes.IsNull())
{
propertyBag["Meta"].Value = (
from entry in nodes
let name = entry.Attributes["name"]
let content = entry.Attributes["content"]
where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
select $"{name.Value}: {content.Value}").ToArray();
}
// Extract text
propertyBag.Text = htmlDoc.ExtractText().Trim();
if (HasLinkStripRules || HasTextStripRules)
{
// Link stripping starts from originalContent, not from the text-stripped
// markup loaded above, and replaces what htmlDoc currently holds.
string content = StripLinks(originalContent);
using (TextReader tr = new StringReader(content))
{
htmlDoc.Load(tr);
}
}
// Default base URL: the response URI up to and including its path.
string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
// Extract Head Base: a <base href="..."> in the head may replace the default base URL.
nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
if (!nodes.IsNull())
{
// Keep only <base> elements whose href is non-empty and well-formed.
baseUrl = nodes
.Select(entry => new {entry, href = entry.Attributes["href"]})
.Where(arg => !arg.href.IsNull()
&& !arg.href.Value.IsNullOrEmpty()
&& Uri.IsWellFormedUriString(arg.href.Value, UriKind.RelativeOrAbsolute))
.Select(t =>
{
// A relative href is appended to the scheme and server of the response URI.
if (Uri.IsWellFormedUriString(t.href.Value, UriKind.Relative))
{
return propertyBag.ResponseUri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped) + t.href.Value;
}
//......... part of the code is omitted here .........