当前位置: 首页>>代码示例>>C#>>正文


C# HtmlDocument.DetectEncoding方法代码示例

本文整理汇总了C#中HtmlAgilityPack.HtmlDocument.DetectEncoding方法的典型用法代码示例。如果您正苦于以下问题:C# HtmlDocument.DetectEncoding方法的具体用法?C# HtmlDocument.DetectEncoding怎么用?C# HtmlDocument.DetectEncoding使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类HtmlAgilityPack.HtmlDocument的用法示例。


在下文中一共展示了HtmlDocument.DetectEncoding方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的C#代码示例。

示例1: Load

        /// <summary>
        /// Downloads <paramref name="url"/> and parses the response body into an <see cref="HtmlDocument"/>.
        /// </summary>
        /// <remarks>
        /// The encoding used for parsing is chosen in this order: the encoding HtmlAgilityPack
        /// detects in the markup, the charset announced in the Content-Type response header,
        /// and finally UTF-8. When the header charset and the chosen encoding differ, the raw
        /// bytes are transcoded from the header charset to the chosen encoding before parsing.
        /// </remarks>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="level">Passed through to <c>LoadData</c>.</param>
        /// <param name="path">Not used by this method.</param>
        /// <param name="userData">Not used by this method.</param>
        /// <param name="tryCount">Passed through to <c>LoadData</c>.</param>
        /// <returns>The parsed document, or <c>null</c> when <c>LoadData</c> returns no data.</returns>
        public override HtmlDocument Load(string url, uint level, string path, object userData, int tryCount)
        {
            var client = new MyWebClient();
            var data = LoadData(client, url, level, tryCount);
            if (data == null)
                return null;

            // Charset announced by the server, if any. The header may be missing entirely.
            Encoding charset = null;
            var headers = client.ResponseHeaders;
            var ct = headers == null ? null : headers["Content-Type"];
            if (!string.IsNullOrEmpty(ct))
            {
                // Capture only the charset token: stop at quotes, ';' or whitespace so that
                // values such as charset="utf-8"; boundary=x are handled.
                var ex = new Regex(@"charset\s*=\s*[""']?([^""';\s]+)", RegexOptions.IgnoreCase);
                var match = ex.Match(ct);

                // Groups.Count is fixed by the pattern, so Success is the only reliable test.
                if (match.Success)
                {
                    try
                    {
                        charset = Encoding.GetEncoding(match.Groups[1].Value);
                    }
                    catch (ArgumentException)
                    {
                        // Unknown or unsupported charset name: ignore the header and
                        // rely on markup detection / the UTF-8 default below.
                    }
                }
            }

            var web = new HtmlDocument();
            Encoding encoding;
            using (MemoryStream ms = new MemoryStream(data))
            {
                encoding = web.DetectEncoding(ms) ?? charset ?? Encoding.UTF8;
            }

            // Encoding instances are compared by value; two instances of the same
            // code page must not trigger a pointless conversion.
            if (charset != null && !charset.Equals(encoding))
            {
                data = Encoding.Convert(charset, encoding, data);
            }

            using (MemoryStream ms = new MemoryStream(data))
            {
                web.Load(ms, encoding);
            }

            return web;
        }
开发者ID:john-guo,项目名称:lnE,代码行数:39,代码来源:WebDish.cs

示例2: DoPullProperties

		/// <summary>
		/// Determines the encoding of the HTML stream and loads it into a streaming
		/// <c>HtmlDocument</c>, storing the results in <c>enc</c> and <c>doc</c>.
		/// </summary>
		/// <remarks>
		/// The encoding is taken from the indexable's unindexed "encoding" property when present;
		/// otherwise the parser is asked to detect it from the stream. If neither yields a usable
		/// encoding, ASCII is used.
		/// </remarks>
		override protected void DoPullProperties ()
		{
			enc = null;

			try {
				foreach (Property prop in Indexable.Properties) {
					if (prop.Key != StringFu.UnindexedNamespace + "encoding")
						continue;

					enc = Encoding.GetEncoding ((string) prop.Value);
					break;
				}

				if (enc == null) {
					// No encoding was supplied, so let the parser detect it.
					HtmlDocument temp_doc = new HtmlDocument ();
					try {
						enc = temp_doc.DetectEncoding (Stream);
					} finally {
						// Detection reads from the stream; rewind even when it fails
						// so that the real parse below starts at the beginning.
						Stream.Seek (0, SeekOrigin.Begin);
					}
				}
			} catch (NotSupportedException) {
				// Encoding passed in isn't supported
			} catch (ArgumentException) {
				// Encoding.GetEncoding reports unknown encoding names this way
			}

			// Default
			if (enc == null)
				enc = Encoding.ASCII;

			doc = new HtmlDocument ();
			doc.ReportNode += HandleNodeEventHead;
			doc.StreamMode = true;
			// we already determined encoding
			doc.OptionReadEncoding = false;

			try {
				// enc is never null at this point, so always load with it.
				doc.Load (Stream, enc);
			} catch (NotSupportedException) {
				enc = Encoding.ASCII;
				// Restart from the beginning in case the failed attempt consumed input.
				if (Stream.CanSeek)
					Stream.Seek (0, SeekOrigin.Begin);
				doc.Load (Stream, enc);
			} catch (Exception e) {
				Log.Debug (e, "Exception while filtering HTML file " +FileInfo.FullName);
			}
		}
开发者ID:ArsenShnurkov,项目名称:beagle-1,代码行数:46,代码来源:FilterHtml.cs

示例3: Process

        /// <summary>
        /// Processes one crawled step: drops queue entries that cannot be handled, loads the
        /// response into an <c>HtmlDocument</c> using the detected encoding, and begins reading
        /// the cached "OriginalWebSite" configuration. The remainder of the method is not
        /// visible in this excerpt.
        /// </summary>
        /// <param name="crawler">The crawler that produced the step; must not be null.</param>
        /// <param name="propertyBag">Response data and metadata for the step; must not be null.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {


            // Argument checks through the AspectF helper.
            AspectF.Define.
                NotNull(crawler, "crawler").
                NotNull(propertyBag, "propertyBag");

            // Lookup key: the unescaped absolute URI, capped at 396 characters
            // (presumably the length of the key column — TODO confirm).
            string stepUri = Uri.UnescapeDataString(propertyBag.Step.Uri.AbsoluteUri);
            if (stepUri.Length > 396)
            {
                stepUri = stepUri.Substring(0, 396);
            }
            var crawlHistory = AspectF.Define.
               Return<CrawlHistory, NCrawlerEntitiesDbServices>(
                   e => e.CrawlHistory.Where(m => m.Key == stepUri).FirstOrDefault());

            // No history row for this key: remove the queue entry and stop.
            if (crawlHistory == null)
            {
                AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
                {
                    e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", stepUri);
                });
                return;
            }
            try
            {
                // Anything other than HTTP 200 is dropped from the queue.
                if (propertyBag.StatusCode != HttpStatusCode.OK)
                {
                    AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
                    {
                        e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
                        //CrawlQueue result = e.CrawlQueue.FirstOrDefault(q => q.Key == crawlHistory.Key);
                        //if (!result.IsNull())
                        //{
                        //    e.DeleteObject(result);
                        //    e.SaveChanges();
                        //}
                    });
                    return;
                }

                // Non-HTML content is dropped from the queue as well.
                if (!IsHtmlContent(propertyBag.ContentType))
                {
                    AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
                    {
                        e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
                        //CrawlQueue result = e.CrawlQueue.FirstOrDefault(q => q.Key == crawlHistory.Key);
                        //if (!result.IsNull())
                        //{
                        //    e.DeleteObject(result);
                        //    e.SaveChanges();
                        //}
                    });
                    return;
                }
                HtmlDocument htmlDoc = new HtmlDocument
                {
                    OptionAddDebuggingAttributes = false,
                    OptionAutoCloseOnEnd = true,
                    OptionFixNestedTags = true,
                    OptionReadEncoding = true
                };
                using (Stream reader = propertyBag.GetResponse())
                {
                    // Detect the encoding first, then rewind so the load starts at the beginning.
                    // NOTE(review): Seek assumes GetResponse() returns a seekable stream — confirm.
                    Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
                    reader.Seek(0, SeekOrigin.Begin);
                    if (!documentEncoding.IsNull())
                    {
                        htmlDoc.Load(reader, documentEncoding, true);
                    }
                    else
                    {
                        // No encoding detected: load without an explicit encoding.
                        htmlDoc.Load(reader, true);
                    }

                    //string content = reader.ReadToEnd();
                    //resultHtmlContent = content;
                }
                //string steplUri = propertyBag.ResponseUri.OriginalString;


                string orginalHtmlContent = htmlDoc.DocumentNode.OuterHtml;
                string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
                DocumentWithLinks links = htmlDoc.GetLinks();



                //string urlRegex = @"^http://www.bbc.co.uk/food/recipes/[^#/]+$";
                List<string> recipeRegex = null;
                // The contents of OriginalWebSite.txt are cached under a key derived from the
                // application base directory; on a miss the file is read and cached for one day.
                var jsonStr = cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite") as string;
                if (jsonStr == null)
                {
                    using (var stream = new StreamReader(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite.txt", Encoding.UTF8))
                    {
                        jsonStr = stream.ReadToEnd();
                        var policy = new CacheItemPolicy();
                        policy.Priority = CacheItemPriority.NotRemovable;
                        policy.AbsoluteExpiration = DateTimeOffset.Now.AddDays(1);
                        cache.Set(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite", jsonStr, policy);
//.........这里部分代码省略.........
开发者ID:GBmono,项目名称:GBmonoV1.0,代码行数:101,代码来源:WholeHtmlProcessor.cs

示例4: Process

        /// <summary>
        /// Processes one crawled HTML response: parses it with the detected encoding, applies the
        /// configured strip/substitution rules, fills the property bag with the document, title,
        /// meta data and text, and starts queuing the links found. The end of the method is not
        /// visible in this excerpt.
        /// </summary>
        /// <param name="crawler">The crawler to which newly found links are added; must not be null.</param>
        /// <param name="propertyBag">Response data and metadata for the step; must not be null.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            // Argument checks through the AspectF helper.
            AspectF.Define.
                NotNull(crawler, "crawler").
                NotNull(propertyBag, "propertyBag");

            // Only successful responses are processed.
            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            // Only HTML content is processed.
            if (!IsHtmlContent(propertyBag.ContentType))
            {
                return;
            }

            HtmlDocument htmlDoc = new HtmlDocument
                {
                    OptionAddDebuggingAttributes = false,
                    OptionAutoCloseOnEnd = true,
                    OptionFixNestedTags = true,
                    OptionReadEncoding = true
                };
            using (Stream reader = propertyBag.GetResponse())
            {
                // Detect the encoding first, then rewind so the load starts at the beginning.
                // NOTE(review): Seek assumes GetResponse() returns a seekable stream — confirm.
                Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
                reader.Seek(0, SeekOrigin.Begin);
                if (!documentEncoding.IsNull())
                {
                    htmlDoc.Load(reader, documentEncoding, true);
                }
                else
                {
                    // No encoding detected: load without an explicit encoding.
                    htmlDoc.Load(reader, true);
                }
            }

            // Keep the unmodified markup; it is the input for both rule passes below.
            string originalContent = htmlDoc.DocumentNode.OuterHtml;
            if (HasTextStripRules || HasSubstitutionRules)
            {
                // Re-parse the document from the stripped and substituted markup.
                string content = StripText(originalContent);
                content = Substitute(content, propertyBag.Step);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            propertyBag["HtmlDoc"].Value = htmlDoc;

            HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
            // Extract Title
            // (all title elements are joined with ';')
            if (!nodes.IsNull())
            {
                propertyBag.Title = string.Join(";", nodes.
                    Select(n => n.InnerText).
                    ToArray()).Trim();
            }

            // Extract Meta Data
            // (stored as "name: content" strings; entries with an empty name or content are skipped)
            nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
            if (!nodes.IsNull())
            {
                propertyBag["Meta"].Value = (
                    from entry in nodes
                    let name = entry.Attributes["name"]
                    let content = entry.Attributes["content"]
                    where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
                    select name.Value + ": " + content.Value).ToArray();
            }

            propertyBag.Text = htmlDoc.ExtractText().Trim();
            if (HasLinkStripRules || HasTextStripRules)
            {
                // Link extraction works on the original markup with the link strip rules applied,
                // not on the text-stripped document loaded above.
                string content = StripLinks(originalContent);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            // Extract Links
            DocumentWithLinks links = htmlDoc.GetLinks();
            foreach (string link in links.Links.Union(links.References))
            {
                if (link.IsNullOrEmpty())
                {
                    continue;
                }

                // Decode HTML entities, then normalize the link against the response URI's path.
                string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
                string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
                string normalizedLink = NormalizeLink(baseUrl, decodedLink);
                if (normalizedLink.IsNullOrEmpty())
                {
                    continue;
                }

                // Queue the link one level deeper than the current step.
                crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
                    propertyBag.Step, new Dictionary<string, object>
//.........这里部分代码省略.........
开发者ID:fzhenmei,项目名称:study,代码行数:101,代码来源:HtmlDocumentProcessor.cs

示例5: ParseArticle

        /// <summary>
        /// Downloads the article at <paramref name="url"/>, parses it as HTML and converts it
        /// into a <c>NewsItem</c> using the manager selected for <paramref name="source"/>.
        /// </summary>
        /// <param name="url">Absolute address of the article page.</param>
        /// <param name="source">Value passed to <c>GetManager</c> to select the parser.</param>
        /// <returns>
        /// The item produced by the manager, with <c>Href</c> set to the client's response URI.
        /// </returns>
        private NewsItem ParseArticle(string url, string source)
        {
            var uri = new Uri(url);
            var manager = GetManager(source);
            var doc = new HtmlDocument();

            CustomWebClient client = new CustomWebClient();

            // Download the page once and keep it in memory: encoding detection and parsing
            // then read exactly the same bytes, and no second request is needed.
            using (var buffer = new MemoryStream())
            {
                using (var response = client.OpenRead(url))
                {
                    response.CopyTo(buffer);
                }

                buffer.Position = 0;
                var encoding = doc.DetectEncoding(buffer) ?? Encoding.UTF8;

                // Detection has read from the buffer; rewind before the real parse.
                buffer.Position = 0;
                doc.Load(buffer, encoding);
            }
            var item = manager.ParseItem(doc, uri.Host);
            item.Href = client.ResponseUri.ToString();
            return item;
        }
开发者ID:bonchovylkov,项目名称:Sport-Classifier,代码行数:23,代码来源:CrowlingService.cs

示例6: Process

		/// <summary>
		/// Processes one crawled HTML response: parses it with the detected encoding, applies the
		/// configured strip/substitution rules, fills the property bag with the document, title,
		/// meta data and text, and starts resolving the base URL from a head/base element. The end
		/// of the method is not visible in this excerpt.
		/// </summary>
		/// <param name="crawler">The crawler running the pipeline; must not be null.</param>
		/// <param name="propertyBag">Response data and metadata for the step; must not be null.</param>
		/// <returns>
		/// A completed task with result <c>true</c> when the response is skipped (non-OK status or
		/// non-HTML content); the result of the full processing path is not visible here.
		/// </returns>
		public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			// Argument checks through the AspectF helper.
			AspectF.Define
				.NotNull(crawler, nameof(crawler))
				.NotNull(propertyBag, nameof(propertyBag));

			// Only successful responses are processed.
			if (propertyBag.StatusCode != HttpStatusCode.OK)
			{
				return Task.FromResult(true);
			}

			// Only HTML content is processed.
			if (!IsHtmlContent(propertyBag.ContentType))
			{
				return Task.FromResult(true);
			}

			HtmlDocument htmlDoc = new HtmlDocument
			{
				OptionAddDebuggingAttributes = false,
				OptionAutoCloseOnEnd = true,
				OptionFixNestedTags = true,
				OptionReadEncoding = true
			};

			using (MemoryStream ms = new MemoryStream(propertyBag.Response))
			{
				// Detect the encoding first, then rewind so the load starts at the beginning.
				Encoding documentEncoding = htmlDoc.DetectEncoding(ms);
				ms.Seek(0, SeekOrigin.Begin);
				if (!documentEncoding.IsNull())
				{
					htmlDoc.Load(ms, documentEncoding, true);
				}
				else
				{
					// No encoding detected: load without an explicit encoding.
					htmlDoc.Load(ms, true);
				}
			}

			// Keep the unmodified markup; it is the input for both rule passes below.
			string originalContent = htmlDoc.DocumentNode.OuterHtml;
			if (HasTextStripRules || HasSubstitutionRules)
			{
				// Re-parse the document from the stripped and substituted markup.
				string content = StripText(originalContent);
				content = Substitute(content, propertyBag.Step);
				using (TextReader tr = new StringReader(content))
				{
					htmlDoc.Load(tr);
				}
			}

			propertyBag["HtmlDoc"].Value = htmlDoc;

			HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
			// Extract Title
			// (all title elements are joined with ';')
			if (!nodes.IsNull())
			{
				propertyBag.Title = string.Join(";", nodes.
					Select(n => n.InnerText).
					ToArray()).Trim();
			}

			// Extract Meta Data
			// (stored as "name: content" strings; entries with an empty name or content are skipped)
			nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
			if (!nodes.IsNull())
			{
				propertyBag["Meta"].Value = (
					from entry in nodes
					let name = entry.Attributes["name"]
					let content = entry.Attributes["content"]
					where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
					select $"{name.Value}: {content.Value}").ToArray();
			}

			// Extract text
			propertyBag.Text = htmlDoc.ExtractText().Trim();
			if (HasLinkStripRules || HasTextStripRules)
			{
				// The document is reloaded from the original markup with the link strip rules
				// applied, replacing the text-stripped version loaded above.
				string content = StripLinks(originalContent);
				using (TextReader tr = new StringReader(content))
				{
					htmlDoc.Load(tr);
				}
			}

			// Default base URL: the response URI up to and including its path.
			string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);

			// Extract Head Base
			// (a well-formed href on a head/base element replaces the default base URL)
			nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
			if (!nodes.IsNull())
			{
				baseUrl = nodes
					.Select(entry => new {entry, href = entry.Attributes["href"]})
					.Where(arg => !arg.href.IsNull()
						&& !arg.href.Value.IsNullOrEmpty()
						&& Uri.IsWellFormedUriString(arg.href.Value, UriKind.RelativeOrAbsolute))
					.Select(t =>
					{
						// A relative href is appended to the scheme and server of the response URI.
						if (Uri.IsWellFormedUriString(t.href.Value, UriKind.Relative))
						{
							return propertyBag.ResponseUri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped) + t.href.Value;
						}
//.........这里部分代码省略.........
开发者ID:esbencarlsen,项目名称:NCrawler,代码行数:101,代码来源:HtmlDocumentProcessorPipelineStep.cs

示例7: ParseKartaitogovWebPage

        /// <summary>
        /// Parses the saved "Kartaitogov/diff.htm" page and, for every UIK heading in it, looks up
        /// the matching region and commission identifiers in the local "elect" database and
        /// prints them to the console.
        /// </summary>
        /// <remarks>
        /// Headings whose number cannot be parsed, or that have no preceding region heading, are
        /// reported on the console and skipped. Regions or commissions missing from the database
        /// are printed with <see cref="Guid.Empty"/>.
        /// </remarks>
        public void ParseKartaitogovWebPage()
        {
            var viewModel = new KartaitogovViewModel(new Logger());
            //string resourceName = "Loader.Tests.diff.htm";
            //Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName);
            var filePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Kartaitogov", "diff.htm");
            //byte[] webpageContent = Encoding.UTF8.GetBytes(File.ReadAllText(filePath));

            /*
            viewModel.Downloader = new MockDownloader(webpageContent);

            var task = viewModel.downloadImages();
            Assert.IsTrue(task.Wait(TimeSpan.FromSeconds(10)));
            Assert.IsNull(viewModel.LastError, "Error occured: " + viewModel.LastError);

            */

            HtmlDocument htmlDoc = new HtmlDocument();
            Encoding encoding = htmlDoc.DetectEncoding(filePath) ?? Encoding.UTF8;
            htmlDoc.Load(filePath, encoding);

            // SelectNodes returns null (not an empty collection) when nothing matches,
            // so check before opening the database connection and iterating.
            var headUikNodes = htmlDoc.DocumentNode.SelectNodes("//h3[@class='uik']");
            if (headUikNodes == null)
            {
                Console.WriteLine("WARN: No UIK headings found in " + filePath);
                return;
            }

            var reUikNumber = new System.Text.RegularExpressions.Regex(@"\d+");
            using (var con = new SqlConnection("Data Source=.;Initial Catalog = elect;Integrated Security=True"))
            using (var cmdRegion = con.CreateCommand())
            using (var cmdComission = con.CreateCommand())
            {
                con.Open();

                cmdRegion.CommandText = "select ObjectID from Region where name = @pName";
                var sqlParamRegName = new SqlParameter("pName", SqlDbType.VarChar);
                cmdRegion.Parameters.Add(sqlParamRegName);

                cmdComission.CommandText = "select ObjectID from Comission where Region = @pRegion and [Number] = @pNumber";
                var sqlParamComNum = new SqlParameter("pNumber", SqlDbType.Int);
                cmdComission.Parameters.Add(sqlParamComNum);
                var sqlParamRegId = new SqlParameter("pRegion", SqlDbType.UniqueIdentifier);
                cmdComission.Parameters.Add(sqlParamRegId);

                // The region lookup is repeated only when the region heading changes.
                string regionName = null;
                Guid regionId = Guid.Empty;
                foreach (HtmlNode headUik in headUikNodes)
                {
                    // NOTE(review): without a [1] predicate this may pick the first matching h2 in
                    // document order rather than the nearest preceding one — confirm.
                    var regionNode = headUik.SelectSingleNode("preceding-sibling::h2[@class='oblast']");
                    var uikText = headUik.InnerText;
                    if (regionNode == null)
                    {
                        Console.WriteLine("ERROR: Can't find region node!");
                        continue;
                    }

                    // TryParse also rejects digit runs that overflow Int32 or use non-ASCII digits,
                    // which \d+ can match but Int32.Parse would throw on.
                    var match = reUikNumber.Match(uikText);
                    int comissionNum;
                    if (!match.Success
                        || !Int32.TryParse(match.Value, System.Globalization.NumberStyles.None,
                            System.Globalization.CultureInfo.InvariantCulture, out comissionNum))
                    {
                        Console.WriteLine("ERROR: Can't parse UIK number: " + uikText);
                        continue;
                    }

                    if (regionName != regionNode.InnerText)
                    {
                        regionName = regionNode.InnerText;
                        sqlParamRegName.Value = regionName;
                        var regionIdRaw = cmdRegion.ExecuteScalar();
                        if (regionIdRaw != null)
                            regionId = (Guid)regionIdRaw;
                        else
                        {
                            regionId = Guid.Empty;
                            Console.WriteLine("WARN: Can't find in DB a region with name: " + regionName);
                        }
                    }

                    sqlParamRegId.Value = regionId;
                    sqlParamComNum.Value = comissionNum;
                    var comissionIdRaw = cmdComission.ExecuteScalar();
                    Guid comissionId = comissionIdRaw != null ? (Guid)comissionIdRaw : Guid.Empty;
                    //Console.WriteLine(regionNode.InnerText + " : " + uikText.Substring(uikText.IndexOf('\n', 0, 2)));
                    Console.WriteLine(regionName + "(" + regionId + ")" + " / " + comissionNum + "(" + comissionId + ")");
                }
            }
        }
开发者ID:evil-shrike,项目名称:RuElect,代码行数:86,代码来源:UnitTest1.cs


注:本文中的HtmlAgilityPack.HtmlDocument.DetectEncoding方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。