本文整理汇总了 C# 中 HtmlAgilityPack.HtmlDocument.DetectEncoding 方法的典型用法代码示例。如果您正苦于以下问题:C# HtmlDocument.DetectEncoding 方法的具体用法是什么?C# HtmlDocument.DetectEncoding 怎么用?有哪些 C# HtmlDocument.DetectEncoding 的使用示例?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 HtmlAgilityPack.HtmlDocument 的用法示例。
在下文中一共展示了HtmlDocument.DetectEncoding方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C#代码示例。
示例1: Load
/// <summary>
/// Downloads the page at <paramref name="url"/> and parses it into an <see cref="HtmlDocument"/>.
/// </summary>
/// <remarks>
/// The encoding used for parsing is chosen in this order: the encoding reported by
/// <c>HtmlDocument.DetectEncoding</c>, then the charset named in the response
/// <c>Content-Type</c> header, then UTF-8. When the header names a charset that differs
/// from the chosen encoding, the raw bytes are transcoded from the header charset first.
/// </remarks>
/// <param name="url">Address to download.</param>
/// <param name="level">Forwarded unchanged to <c>LoadData</c>.</param>
/// <param name="path">Not used by this implementation.</param>
/// <param name="userData">Not used by this implementation.</param>
/// <param name="tryCount">Forwarded unchanged to <c>LoadData</c>.</param>
/// <returns>The parsed document, or <c>null</c> when <c>LoadData</c> returns no data.</returns>
public override HtmlDocument Load(string url, uint level, string path, object userData, int tryCount)
{
    var client = new MyWebClient();
    var data = LoadData(client, url, level, tryCount);
    if (data == null)
        return null;

    // Charset advertised by the HTTP Content-Type header, if any.
    Encoding charset = null;
    var headers = client.ResponseHeaders;
    var ct = headers != null ? headers["Content-Type"] : null;
    if (!string.IsNullOrEmpty(ct))
    {
        // Stop at quotes, ';' and whitespace so trailing header parameters are not
        // swallowed into the charset name.
        var ex = new Regex(@"charset\s*=\s*[""']?([^""';\s]+)", RegexOptions.IgnoreCase);
        var match = ex.Match(ct);

        // Groups.Count reflects the pattern, not the outcome, so it cannot be used to
        // tell whether a charset was found; Success can.
        if (match.Success)
        {
            try
            {
                charset = Encoding.GetEncoding(match.Groups[1].Value);
            }
            catch (ArgumentException)
            {
                // Unknown or unsupported charset name: behave as if the header had none.
                charset = null;
            }
        }
    }

    var web = new HtmlDocument();
    Encoding encoding;
    using (MemoryStream ms = new MemoryStream(data))
    {
        encoding = web.DetectEncoding(ms) ?? charset ?? Encoding.UTF8;
    }

    // Encoding does not overload ==/!=, so compare by value rather than by reference.
    if (charset != null && !charset.Equals(encoding))
    {
        data = Encoding.Convert(charset, encoding, data);
    }

    using (MemoryStream ms = new MemoryStream(data))
    {
        web.Load(ms, encoding);
    }

    return web;
}
示例2: DoPullProperties
/// <summary>
/// Picks the text encoding for <c>Stream</c> and loads it into a new stream-mode
/// <see cref="HtmlDocument"/> stored in <c>doc</c>.
/// </summary>
/// <remarks>
/// The encoding comes from the indexable's "encoding" property when present, otherwise
/// from <c>HtmlDocument.DetectEncoding</c>, otherwise ASCII. Parsing errors other than
/// <see cref="NotSupportedException"/> are logged and not rethrown.
/// </remarks>
override protected void DoPullProperties ()
{
    enc = null;

    try {
        foreach (Property prop in Indexable.Properties) {
            if (prop.Key != StringFu.UnindexedNamespace + "encoding")
                continue;

            try {
                enc = Encoding.GetEncoding ((string) prop.Value);
            } catch (ArgumentException) {
                // Unknown encoding name: leave enc unset so detection below takes over.
            }
            break;
        }

        if (enc == null) {
            // No usable encoding was supplied, so ask the parser to detect one.
            // A throw-away document is used so the real one starts from a clean state.
            HtmlDocument temp_doc = new HtmlDocument ();
            enc = temp_doc.DetectEncoding (Stream);

            // Detection consumed the stream; rewind before the real parse.
            Stream.Seek (0, SeekOrigin.Begin);
        }
    } catch (NotSupportedException) {
        // Encoding passed in isn't supported
    }

    // Default
    if (enc == null)
        enc = Encoding.ASCII;

    doc = new HtmlDocument ();
    doc.ReportNode += HandleNodeEventHead;
    doc.StreamMode = true;
    // we already determined encoding
    doc.OptionReadEncoding = false;

    try {
        // enc is never null here because of the ASCII default above.
        doc.Load (Stream, enc);
    } catch (NotSupportedException) {
        enc = Encoding.ASCII;
        doc.Load (Stream, enc);
    } catch (Exception e) {
        Log.Debug (e, "Exception while filtering HTML file " +FileInfo.FullName);
    }
}
示例3: Process
public void Process(Crawler crawler, PropertyBag propertyBag)
{
AspectF.Define.
NotNull(crawler, "crawler").
NotNull(propertyBag, "propertyBag");
string stepUri = Uri.UnescapeDataString(propertyBag.Step.Uri.AbsoluteUri);
if (stepUri.Length > 396)
{
stepUri = stepUri.Substring(0, 396);
}
var crawlHistory = AspectF.Define.
Return<CrawlHistory, NCrawlerEntitiesDbServices>(
e => e.CrawlHistory.Where(m => m.Key == stepUri).FirstOrDefault());
if (crawlHistory == null)
{
AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
{
e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", stepUri);
});
return;
}
try
{
if (propertyBag.StatusCode != HttpStatusCode.OK)
{
AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
{
e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
//CrawlQueue result = e.CrawlQueue.FirstOrDefault(q => q.Key == crawlHistory.Key);
//if (!result.IsNull())
//{
// e.DeleteObject(result);
// e.SaveChanges();
//}
});
return;
}
if (!IsHtmlContent(propertyBag.ContentType))
{
AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
{
e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
//CrawlQueue result = e.CrawlQueue.FirstOrDefault(q => q.Key == crawlHistory.Key);
//if (!result.IsNull())
//{
// e.DeleteObject(result);
// e.SaveChanges();
//}
});
return;
}
HtmlDocument htmlDoc = new HtmlDocument
{
OptionAddDebuggingAttributes = false,
OptionAutoCloseOnEnd = true,
OptionFixNestedTags = true,
OptionReadEncoding = true
};
using (Stream reader = propertyBag.GetResponse())
{
Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
reader.Seek(0, SeekOrigin.Begin);
if (!documentEncoding.IsNull())
{
htmlDoc.Load(reader, documentEncoding, true);
}
else
{
htmlDoc.Load(reader, true);
}
//string content = reader.ReadToEnd();
//resultHtmlContent = content;
}
//string steplUri = propertyBag.ResponseUri.OriginalString;
string orginalHtmlContent = htmlDoc.DocumentNode.OuterHtml;
string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
DocumentWithLinks links = htmlDoc.GetLinks();
//string urlRegex = @"^http://www.bbc.co.uk/food/recipes/[^#/]+$";
List<string> recipeRegex = null;
var jsonStr = cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite") as string;
if (jsonStr == null)
{
using (var stream = new StreamReader(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite.txt", Encoding.UTF8))
{
jsonStr = stream.ReadToEnd();
var policy = new CacheItemPolicy();
policy.Priority = CacheItemPriority.NotRemovable;
policy.AbsoluteExpiration = DateTimeOffset.Now.AddDays(1);
cache.Set(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite", jsonStr, policy);
//.........这里部分代码省略.........
示例4: Process
public void Process(Crawler crawler, PropertyBag propertyBag)
{
AspectF.Define.
NotNull(crawler, "crawler").
NotNull(propertyBag, "propertyBag");
if (propertyBag.StatusCode != HttpStatusCode.OK)
{
return;
}
if (!IsHtmlContent(propertyBag.ContentType))
{
return;
}
HtmlDocument htmlDoc = new HtmlDocument
{
OptionAddDebuggingAttributes = false,
OptionAutoCloseOnEnd = true,
OptionFixNestedTags = true,
OptionReadEncoding = true
};
using (Stream reader = propertyBag.GetResponse())
{
Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
reader.Seek(0, SeekOrigin.Begin);
if (!documentEncoding.IsNull())
{
htmlDoc.Load(reader, documentEncoding, true);
}
else
{
htmlDoc.Load(reader, true);
}
}
string originalContent = htmlDoc.DocumentNode.OuterHtml;
if (HasTextStripRules || HasSubstitutionRules)
{
string content = StripText(originalContent);
content = Substitute(content, propertyBag.Step);
using (TextReader tr = new StringReader(content))
{
htmlDoc.Load(tr);
}
}
propertyBag["HtmlDoc"].Value = htmlDoc;
HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
// Extract Title
if (!nodes.IsNull())
{
propertyBag.Title = string.Join(";", nodes.
Select(n => n.InnerText).
ToArray()).Trim();
}
// Extract Meta Data
nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
if (!nodes.IsNull())
{
propertyBag["Meta"].Value = (
from entry in nodes
let name = entry.Attributes["name"]
let content = entry.Attributes["content"]
where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
select name.Value + ": " + content.Value).ToArray();
}
propertyBag.Text = htmlDoc.ExtractText().Trim();
if (HasLinkStripRules || HasTextStripRules)
{
string content = StripLinks(originalContent);
using (TextReader tr = new StringReader(content))
{
htmlDoc.Load(tr);
}
}
// Extract Links
DocumentWithLinks links = htmlDoc.GetLinks();
foreach (string link in links.Links.Union(links.References))
{
if (link.IsNullOrEmpty())
{
continue;
}
string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
string normalizedLink = NormalizeLink(baseUrl, decodedLink);
if (normalizedLink.IsNullOrEmpty())
{
continue;
}
crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
propertyBag.Step, new Dictionary<string, object>
//.........这里部分代码省略.........
示例5: ParseArticle
/// <summary>
/// Downloads the article at <paramref name="url"/>, parses it as HTML and converts it
/// to a <c>NewsItem</c> using the manager selected by <paramref name="source"/>.
/// </summary>
/// <param name="url">Absolute address of the article.</param>
/// <param name="source">Key passed to <c>GetManager</c> to choose the parser.</param>
/// <returns>The item produced by the manager, with <c>Href</c> set from the client's response URI.</returns>
private NewsItem ParseArticle(string url, string source)
{
    var uri = new Uri(url);
    var manager = GetManager(source);
    var doc = new HtmlDocument();
    CustomWebClient client = new CustomWebClient();

    // Download once and keep the bytes: encoding detection and parsing then work on the
    // same response instead of issuing two separate requests.
    using (var buffer = new MemoryStream())
    {
        using (var response = client.OpenRead(url))
        {
            response.CopyTo(buffer);
        }

        buffer.Seek(0, SeekOrigin.Begin);
        var encoding = doc.DetectEncoding(buffer) ?? Encoding.UTF8;

        // Detection has advanced the stream; rewind before the real parse.
        buffer.Seek(0, SeekOrigin.Begin);
        doc.Load(buffer, encoding);
    }

    var item = manager.ParseItem(doc, uri.Host);
    item.Href = client.ResponseUri.ToString();
    return item;
}
示例6: Process
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
AspectF.Define
.NotNull(crawler, nameof(crawler))
.NotNull(propertyBag, nameof(propertyBag));
if (propertyBag.StatusCode != HttpStatusCode.OK)
{
return Task.FromResult(true);
}
if (!IsHtmlContent(propertyBag.ContentType))
{
return Task.FromResult(true);
}
HtmlDocument htmlDoc = new HtmlDocument
{
OptionAddDebuggingAttributes = false,
OptionAutoCloseOnEnd = true,
OptionFixNestedTags = true,
OptionReadEncoding = true
};
using (MemoryStream ms = new MemoryStream(propertyBag.Response))
{
Encoding documentEncoding = htmlDoc.DetectEncoding(ms);
ms.Seek(0, SeekOrigin.Begin);
if (!documentEncoding.IsNull())
{
htmlDoc.Load(ms, documentEncoding, true);
}
else
{
htmlDoc.Load(ms, true);
}
}
string originalContent = htmlDoc.DocumentNode.OuterHtml;
if (HasTextStripRules || HasSubstitutionRules)
{
string content = StripText(originalContent);
content = Substitute(content, propertyBag.Step);
using (TextReader tr = new StringReader(content))
{
htmlDoc.Load(tr);
}
}
propertyBag["HtmlDoc"].Value = htmlDoc;
HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
// Extract Title
if (!nodes.IsNull())
{
propertyBag.Title = string.Join(";", nodes.
Select(n => n.InnerText).
ToArray()).Trim();
}
// Extract Meta Data
nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
if (!nodes.IsNull())
{
propertyBag["Meta"].Value = (
from entry in nodes
let name = entry.Attributes["name"]
let content = entry.Attributes["content"]
where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
select $"{name.Value}: {content.Value}").ToArray();
}
// Extract text
propertyBag.Text = htmlDoc.ExtractText().Trim();
if (HasLinkStripRules || HasTextStripRules)
{
string content = StripLinks(originalContent);
using (TextReader tr = new StringReader(content))
{
htmlDoc.Load(tr);
}
}
string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
// Extract Head Base
nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
if (!nodes.IsNull())
{
baseUrl = nodes
.Select(entry => new {entry, href = entry.Attributes["href"]})
.Where(arg => !arg.href.IsNull()
&& !arg.href.Value.IsNullOrEmpty()
&& Uri.IsWellFormedUriString(arg.href.Value, UriKind.RelativeOrAbsolute))
.Select(t =>
{
if (Uri.IsWellFormedUriString(t.href.Value, UriKind.Relative))
{
return propertyBag.ResponseUri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped) + t.href.Value;
}
//.........这里部分代码省略.........
示例7: ParseKartaitogovWebPage
/// <summary>
/// Parses the local "Kartaitogov/diff.htm" page and, for every <c>h3.uik</c> heading,
/// looks up the matching region and commission ids in the database and prints them.
/// </summary>
/// <remarks>
/// Results and problems are written to the console only; nothing is asserted.
/// </remarks>
public void ParseKartaitogovWebPage()
{
    // NOTE(review): the instance is not used below; kept in case construction matters — confirm.
    var viewModel = new KartaitogovViewModel(new Logger());
    //string resourceName = "Loader.Tests.diff.htm";
    //Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName);
    var filePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Kartaitogov", "diff.htm");
    //byte[] webpageContent = Encoding.UTF8.GetBytes(File.ReadAllText(filePath));
    /*
    viewModel.Downloader = new MockDownloader(webpageContent);
    var task = viewModel.downloadImages();
    Assert.IsTrue(task.Wait(TimeSpan.FromSeconds(10)));
    Assert.IsNull(viewModel.LastError, "Error occured: " + viewModel.LastError);
    */
    HtmlDocument htmlDoc = new HtmlDocument();
    Encoding encoding = htmlDoc.DetectEncoding(filePath) ?? Encoding.UTF8;
    htmlDoc.Load(filePath, encoding);

    var reUikNumber = new System.Text.RegularExpressions.Regex(@"\d+");

    using (var con = new SqlConnection("Data Source=.;Initial Catalog = elect;Integrated Security=True"))
    {
        con.Open();

        // Commands are disposable; release them together with the connection.
        using (var cmdRegion = con.CreateCommand())
        using (var cmdComission = con.CreateCommand())
        {
            cmdRegion.CommandText = "select ObjectID from Region where name = @pName";
            var sqlParamRegName = new SqlParameter("pName", SqlDbType.VarChar);
            cmdRegion.Parameters.Add(sqlParamRegName);

            cmdComission.CommandText = "select ObjectID from Comission where Region = @pRegion and [Number] = @pNumber";
            var sqlParamComNum = new SqlParameter("pNumber", SqlDbType.Int);
            cmdComission.Parameters.Add(sqlParamComNum);
            var sqlParamRegId = new SqlParameter("pRegion", SqlDbType.UniqueIdentifier);
            cmdComission.Parameters.Add(sqlParamRegId);

            // SelectNodes yields null rather than an empty collection when nothing matches.
            var uikHeaders = htmlDoc.DocumentNode.SelectNodes("//h3[@class='uik']");
            if (uikHeaders == null)
            {
                Console.WriteLine("WARN: No UIK headers found in " + filePath);
                return;
            }

            // Region lookups are cached across consecutive headings of the same region.
            string regionName = null;
            Guid regionId = Guid.Empty;

            foreach (HtmlNode headUik in uikHeaders)
            {
                var regionNode = headUik.SelectSingleNode("preceding-sibling::h2[@class='oblast']");
                var uikText = headUik.InnerText;

                if (regionNode == null)
                {
                    Console.WriteLine("ERROR: Can't find region node!");
                    continue;
                }

                var match = reUikNumber.Match(uikText);
                if (!match.Success)
                {
                    Console.WriteLine("ERROR: Can't parse UIK number: " + uikText);
                    continue;
                }

                if (regionName != regionNode.InnerText)
                {
                    regionName = regionNode.InnerText;
                    sqlParamRegName.Value = regionName;

                    // ExecuteScalar gives null for an empty result and DBNull for a NULL
                    // column, so test the type instead of only comparing with null.
                    var regionIdRaw = cmdRegion.ExecuteScalar();
                    if (regionIdRaw is Guid)
                    {
                        regionId = (Guid)regionIdRaw;
                    }
                    else
                    {
                        regionId = Guid.Empty;
                        Console.WriteLine("WARN: Can't find in DB a region with name: " + regionName);
                    }
                }

                sqlParamRegId.Value = regionId;
                int comissionNum = Int32.Parse(match.Value);
                sqlParamComNum.Value = comissionNum;

                var comissionIdRaw = cmdComission.ExecuteScalar();
                Guid comissionId = comissionIdRaw is Guid ? (Guid)comissionIdRaw : Guid.Empty;

                //Console.WriteLine(regionNode.InnerText + " : " + uikText.Substring(uikText.IndexOf('\n', 0, 2)));
                Console.WriteLine(regionName + "(" + regionId + ")" + " / " + comissionNum + "(" + comissionId + ")");
            }
        }
    }
}