Golang Page.GetHtmlParser方法代码示例

本文整理汇总了Golang中github.com/hu17889/go_spider/core/common/page.Page.GetHtmlParser方法的典型用法代码示例。如果您正苦于以下问题：Golang Page.GetHtmlParser方法的具体用法？Golang Page.GetHtmlParser怎么用？Golang Page.GetHtmlParser使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类型github.com/hu17889/go_spider/core/common/page.Page的用法示例。

在下文中一共展示了Page.GetHtmlParser方法的11个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Golang代码示例。

示例1: Process

// Process reads a downloaded page, queues the repository links found on it
// and records the author/project pair on p.
// HTML is queried with goquery (http://godoc.org/github.com/PuerkitoBio/goquery).
func (this *MyPageProcesser) Process(p *page.Page) {
	// Nothing to parse when the download failed; print the reason and stop.
	if succeeded := p.IsSucc(); !succeeded {
		println(p.Errormsg())
		return
	}

	const cutset = " \t\n"
	doc := p.GetHtmlParser()

	// Every anchor under the repo-list heading becomes a new "html" request.
	var links []string
	collect := func(_ int, anchor *goquery.Selection) {
		target, _ := anchor.Attr("href")
		links = append(links, "http://github.com/"+target)
	}
	doc.Find("h3[class='repo-list-name'] a").Each(collect)
	p.AddTargetRequests(links, "html")

	author := strings.Trim(doc.Find(".entry-title .author").Text(), cutset)
	project := strings.Trim(doc.Find(".entry-title .js-current-repository").Text(), cutset)

	// A page without an author is flagged through SetSkip; the fields are
	// still added afterwards, exactly as before.
	if author == "" {
		p.SetSkip(true)
	}

	// The entity handed to the pipeline.
	p.AddField("author", author)
	p.AddField("project", project)
}

开发者ID:xujb，项目名称:go_spider，代码行数:30，代码来源:main.go

示例2: Process

// Process queues the thread links found in the thread list table and records
// the title and first author of the page on p.
// HTML is queried with goquery (http://godoc.org/github.com/PuerkitoBio/goquery).
func (this *MyPageProcesser) Process(p *page.Page) {
	query := p.GetHtmlParser()
	var urls []string
	query.Find("#threadlisttableid tbody").Each(func(i int, s *goquery.Selection) {
		if s.HasClass("emptb") {
			return
		}
		// Attr reports whether the attribute exists; a row without a link
		// used to contribute an empty URL to the request queue.
		href, ok := s.Find("tbody tr .icn a").Attr("href")
		if !ok || href == "" {
			return
		}
		urls = append(urls, href)
	})

	// Queue the collected links as new "html" requests.
	p.AddTargetRequests(urls, "html")

	// Note: the cutset strips tabs and line breaks only, not spaces.
	title := query.Find("#thread_subject").Text()
	title = strings.Trim(title, "\t\n\r")
	author := query.Find("#postlist div .authi").Eq(0).Text()
	author = strings.Trim(author, "\t\r\n")

	// Pages missing either value are flagged through SetSkip; the fields are
	// still added afterwards.
	if title == "" || author == "" {
		p.SetSkip(true)
	}

	p.AddField("title", title)
	p.AddField("author", author)
}

开发者ID:tuyuwei，项目名称:test，代码行数:28，代码来源:main.go

示例3: Process

// Process records the entry name and its summary text on p.
// HTML is queried with goquery (http://godoc.org/github.com/PuerkitoBio/goquery).
func (this *MyPageProcesser) Process(p *page.Page) {
	const cutset = " \t\n"
	doc := p.GetHtmlParser()

	// textOf returns the text of all nodes matching selector with leading and
	// trailing spaces, tabs and newlines removed.
	textOf := func(selector string) string {
		return strings.Trim(doc.Find(selector).Text(), cutset)
	}

	title := textOf(".lemmaTitleH1")
	abstract := textOf(".card-summary-content .para")

	// The entity handed to the pipeline.
	p.AddField("name", title)
	p.AddField("summary", abstract)
}

开发者ID:w3hacker，项目名称:go_spider，代码行数:15，代码来源:main.go

示例4: Process

// Process runs the extraction helpers of PlantProcesser on a downloaded page
// and queues the links returned by getUrls as new "html" requests.
func (this *PlantProcesser) Process(p *page.Page) {
	// Nothing to parse when the download failed; print the reason and stop.
	if ok := p.IsSucc(); !ok {
		println(p.Errormsg())
		return
	}

	doc := p.GetHtmlParser()

	// Pages rejected by isPlant are flagged through SetSkip; the extraction
	// below still runs for them.
	plant := this.isPlant(doc, p)
	if !plant {
		p.SetSkip(true)
	}

	this.getName(doc, p)
	this.getSummary(doc, p)
	this.getCatalog(doc, p)

	next := this.getUrls(doc)
	p.AddTargetRequests(next, "html")
}

开发者ID:liulnn，项目名称:plant-spider，代码行数:16，代码来源:processer.go

示例5: Process

// Process handles one downloaded page: it uploads every not-yet-visited link
// that matches the crawl pattern, stores the page HTML, commits the page URL,
// signals releaseSlot and finally queues one more URL obtained from GetOneURL.
// HTML is queried with goquery (http://godoc.org/github.com/PuerkitoBio/goquery).
func (this *MyPageProcesser) Process(p *page.Page) {
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}
	query := p.GetHtmlParser()
	currentUrl := p.GetRequest().GetUrl()

	// Relative links must be resolved against the page URL. Plain string
	// concatenation (the previous approach) yields malformed URLs such as
	// "http://host/a.htmlb.html".
	pageURL, err := url.Parse(currentUrl)
	if err != nil {
		// pageURL is nil here; relative links are skipped below, but the page
		// is still stored and committed.
		mlog.LogInst().LogError(err.Error())
	}

	var urls []string
	query.Find("a").Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		urlHref, err := url.Parse(href)
		if err != nil {
			mlog.LogInst().LogError(err.Error())
			return
		}
		if !urlHref.IsAbs() {
			if pageURL == nil {
				return
			}
			href = pageURL.ResolveReference(urlHref).String()
		}
		// Temporarily check in crawler.go, it will be implemented in pattern package.

		if checkMatchPattern(base, href) {
			// NOTE(review): the second result of CheckIfVisited is ignored, as
			// in the original; confirm whether it needs handling.
			visited, _ := rep.CheckIfVisited(href)
			if !visited {
				rep.VisitedNewNode(href)
				// urls = append(urls, href)
				urlstr.UploadURL(href)
			}
		}
	})

	// store content to db

	fmt.Printf("====store & commit : %s====\n\n\n", currentUrl)
	content, err := query.Html()
	if err != nil {
		// Keep going with whatever content was produced so that the commit
		// and the releaseSlot signal below still happen.
		mlog.LogInst().LogError(err.Error())
	}
	storage.StoreInsert(collection, storage.StoreFormat{currentUrl, content})
	urlstr.CommitURL(currentUrl)
	releaseSlot <- 1

	// Named nextURL rather than url so the net/url package is not shadowed.
	nextURL := GetOneURL()
	if nextURL != "" {
		urls = append(urls, nextURL)
	}

	p.AddTargetRequests(urls, "html")
}

开发者ID:plutoshe，项目名称:webCrawler，代码行数:49，代码来源:crawler.go

示例6: Process

// Process queues every navigable link of a page hosted under jiexieyin.org
// as a new "html" request. Failed downloads, unparsable page URLs and pages
// from other hosts are ignored.
func (this *MyProcessor) Process(p *page.Page) {
	if !p.IsSucc() {
		mlog.LogInst().LogError(p.Errormsg())
		return
	}

	u, err := url.Parse(p.GetRequest().GetUrl())
	if err != nil {
		mlog.LogInst().LogError(err.Error())
		return
	}
	// NOTE(review): a plain suffix test also accepts hosts such as
	// "xjiexieyin.org" and rejects hosts carrying a port; confirm this is
	// acceptable.
	if !strings.HasSuffix(u.Host, "jiexieyin.org") {
		return
	}

	// Compiled once per call instead of three times per anchor. Matches
	// javascript: pseudo-links, in-page fragments and mailto: links.
	reSkip := regexp.MustCompile("^(?:javascript:|#|mailto:)")

	var urls []string
	query := p.GetHtmlParser()

	query.Find("a").Each(func(i int, s *goquery.Selection) {
		href, ok := s.Attr("href")
		if !ok || reSkip.MatchString(href) {
			return
		}

		urlHref, err := url.Parse(href)
		if err != nil {
			mlog.LogInst().LogError(err.Error())
			return
		}
		// Resolve relative links against the page URL. Concatenating the two
		// strings (the previous approach) produced malformed URLs.
		if !urlHref.IsAbs() {
			href = u.ResolveReference(urlHref).String()
		}
		urls = append(urls, href)
	})

	p.AddTargetRequests(urls, "html")
}

开发者ID:wadee，项目名称:go_proj，代码行数:47，代码来源:main.go

示例7: TestDownloadHtml

// TestDownloadHtml downloads a live page with the HTTP downloader and checks
// that the parsed document contains a <body> element. It needs network access.
func TestDownloadHtml(t *testing.T) {
	//return
	//request := request.NewRequest("http://live.sina.com.cn/zt/api/l/get/finance/globalnews1/index.htm?format=json&callback=t13975294&id=23521&pagesize=45&dire=f&dpc=1")

	// Explicit types are kept on the declarations below so that every package
	// the original test referenced stays in use.
	var req *request.Request = request.NewRequest("http://live.sina.com.cn/zt/l/v/finance/globalnews1/", "html", "", "GET", "", nil, nil, nil, nil)

	var dl downloader.Downloader = downloader.NewHttpDownloader()

	var p *page.Page = dl.Download(req)
	// Fail with the download error instead of going on to use the page.
	if !p.IsSucc() {
		t.Fatalf("download failed: %v", p.Errormsg())
	}

	var doc *goquery.Document = p.GetHtmlParser()
	if doc == nil {
		t.Fatal("html parser is nil")
	}
	//fmt.Println(doc)
	//body := p.GetBodyStr()
	//fmt.Println(body)

	var s *goquery.Selection = doc.Find("body")
	if s.Length() < 1 {
		t.Error("html parse failed!")
	}

	/*
	   doc, err := goquery.NewDocument("http://live.sina.com.cn/zt/l/v/finance/globalnews1/")
	   if err != nil {
	       fmt.Printf("%v",err)
	   }
	   s := doc.Find("meta");
	   fmt.Println(s.Length())

	   resp, err := http.Get("http://live.sina.com.cn/zt/l/v/finance/globalnews1/")
	   if err != nil {
	       fmt.Printf("%v",err)
	   }
	   defer resp.Body.Close()
	   doc, err = goquery.NewDocumentFromReader(resp.Body)
	   s = doc.Find("meta");
	   fmt.Println(s.Length())
	*/
}

开发者ID:CrocdileChan，项目名称:go_spider，代码行数:42，代码来源:downloader_test.go

示例8: Process

/*
 ** Process handles one downloaded page. According to the original author's
 ** note it parses the page, stores follower information in DynamoDB and puts
 ** the URLs to crawl next into SQS; those storage calls are not visible in
 ** this function and presumably live in the this.w helpers.
 **
 ** The function reads and advances the index i and reads Urls, UrlsLevel,
 ** crawlUrl and headerJson, none of which are declared here.
 */
func (this *MyPageProcesser) Process(p *page.Page) {
	// Log the failure reason and stop when the download did not succeed.
	if !p.IsSucc() {
		glog.Errorln(p.Errormsg())
		return
	}
	/*
	 ** Log the crawled page.
	 */
	glog.Infoln(p)
	query := p.GetHtmlParser()

	// Step over a "weibo.cn" entry so the level lookup below uses the next one.
	if Urls[i] == "weibo.cn" {
		i = i + 1
	}

	// Level 0: collect the next-page URL and the friends URLs.
	// Level 1: extract the friends' information.
	// Any other level does nothing here.
	if UrlsLevel[i] == 0 {
		glog.Infoln("layer:", crawlUrl.Layer)
		this.w.GetNextPageUrl(query, p)
		this.w.GetFriendsUrl(query, p)
	} else if UrlsLevel[i] == 1 {
		this.w.GetFriendsInfo(query)
	}
	// if crawlUrl.Layer == 0 {
	// } else if crawlUrl.Layer == 1 {
	// 	glog.Infoln("layer:", crawlUrl.Layer)
	// 	this.w.GetNextPageUrl(query, p)
	// 	this.w.GetFFUrl(query)
	// } else if crawlUrl.Layer == 2 {
	// 	glog.Infoln("layer:", crawlUrl.Layer)
	// 	this.w.GetFFInfo(query)
	// }
	//

	// Pick one of the header entries at random; rand.Intn(9) yields 0..8.
	// NOTE(review): assumes headerJson holds at least 9 entries — confirm.
	header_num := rand.Intn(9)
	header_json := headerJson[header_num]
	// Advance to the next URL and queue it with the chosen header.
	// NOTE(review): no bounds check on i before indexing Urls — confirm the
	// caller guarantees a further entry exists.
	i = i + 1
	p.AddTargetRequestWithHeaderFile(Urls[i], "html", header_json)

}

开发者ID:luzh0422，项目名称:spider-docker，代码行数:42，代码来源:spider.go

示例9: Process

// Process handles two kinds of pages, selected by the request's URL tag:
//
//   - "index": every matching link on the page is turned into a "changyan"
//     request, with the link text passed as the last argument of addRequest.
//   - "changyan": the body is decoded as JSON into ChangyanJson and the
//     comment and participation counts are printed next to the request's
//     string meta value.
func (this MyPageProcesser) Process(p *page.Page) {
	query := p.GetHtmlParser()

	if p.GetUrlTag() == "index" {
		query.Find(`div[class="main area"] div[class="lc"] ul li a`).Each(func(i int, s *goquery.Selection) {
			href, ok := s.Attr("href")
			if !ok {
				return
			}

			// Choose the pattern and the matching URL template. rxPic is
			// tested first because it took precedence when both matched.
			// Links matching neither pattern are skipped outright, which
			// replaces a placeholder regexp compiled for every anchor.
			var (
				reg    *regexp.Regexp
				fmtStr string
			)
			switch {
			case rxPic.MatchString(href):
				reg, fmtStr = rxPic, wkSohuPic
			case rxYule.MatchString(href):
				reg, fmtStr = rxYule, wkSohuYule
			default:
				return
			}

			// Exactly one capture group is expected: full match plus one submatch.
			match := reg.FindStringSubmatch(href)
			if len(match) == 2 {
				addRequest(p, "changyan", fmt.Sprintf(fmtStr, match[1]), "", s.Text())
			}
		})
	}

	if p.GetUrlTag() == "changyan" {
		jsonMap := ChangyanJson{}
		// Best effort: a body that does not decode is silently ignored.
		err := json.NewDecoder(strings.NewReader(p.GetBodyStr())).Decode(&jsonMap)
		if err == nil {
			content, ok := p.GetRequest().GetMeta().(string)
			if ok {
				fmt.Println("Title:", content, " CommentCount:", jsonMap.ListData.OuterCmtSum, " ParticipationCount:", jsonMap.ListData.ParticipationSum)
			}
		}
	}
}

开发者ID:CrocdileChan，项目名称:go_spider，代码行数:38，代码来源:main.go

示例10: Process

// Process downloads the audio files listed on an "index" page. For every
// list entry whose title IsFileExist does not report as present, it runs
// wget to save the entry's data-url as "<title>.mp3". Pages with any other
// URL tag are ignored.
func (this SitePageProcesser) Process(p *page.Page) {
	fmt.Println("Site Page Processer")

	if p.GetUrlTag() != "index" {
		return
	}

	query := p.GetHtmlParser()
	query.Find("ul[class='audioList fontYaHei'] li a").Each(func(i int, s *goquery.Selection) {
		strTitle, _ := s.Attr("title")
		strUrl, hasUrl := s.Attr("data-url")
		// Without a URL there is nothing to download.
		if !hasUrl || strUrl == "" {
			return
		}

		// NOTE(review): the existence check receives the bare title while the
		// file is written as "<title>.mp3" — confirm IsFileExist accounts for
		// the extension.
		if IsFileExist(strTitle) {
			return
		}

		// NOTE(review): strTitle comes from the scraped page and is used as a
		// file name unchanged; it may contain path separators.
		strFileName := fmt.Sprintf("%s.mp3", strTitle)
		fmt.Println(strFileName)

		// "--" ends option parsing so a scraped URL beginning with "-" cannot
		// be interpreted as a wget option.
		cmd := exec.Command("/usr/local/bin/wget", "-O", strFileName, "--", strUrl)

		// Run the command exactly once. Calling Output after Run on the same
		// Cmd (the previous code) always fails because the command has
		// already been started.
		d, err := cmd.Output()
		if err != nil {
			fmt.Println(err)
		}
		fmt.Println(string(d))
	})
}

开发者ID:rpoverflow，项目名称:LiZhiFMCrawler，代码行数:23，代码来源:sitePageProcess.go

示例11: Process

// Process gathers the text of every element whose class name contains
// "content" and stores it, with all spaces, tabs and newlines removed, in the
// "content" field of p. Nothing is stored when no text is found.
// HTML is queried with goquery (http://godoc.org/github.com/PuerkitoBio/goquery).
func (this *MyPageProcesser) Process(p *page.Page) {
	if !p.IsSucc() {
		println(p.Errormsg())
		return
	}
	query := p.GetHtmlParser()
	body := p.GetBodyStr()

	// Find the candidate class names in the raw markup; the capture group
	// holds the class name itself.
	reg := regexp.MustCompile(`class="([0-9a-zA-Z_-]*content[0-9a-zA-Z_-]*)"`)
	matches := reg.FindAllStringSubmatch(body, -1)
	classNames := make([]string, 0, len(matches))
	for _, m := range matches {
		classNames = append(classNames, m[1])
	}
	removeDuplicate(&classNames)

	// One pass removes every space, newline and tab. This gives the same
	// result as the former Trim followed by three Replace calls, since the
	// trimmed characters are exactly the ones being removed.
	stripper := strings.NewReplacer(" ", "", "\n", "", "\t", "")

	// Pieces are collected and joined once instead of growing a string with
	// repeated concatenation.
	var parts []string
	for _, name := range classNames {
		query.Find("." + name).Each(func(i int, s *goquery.Selection) {
			if text := stripper.Replace(s.Text()); text != "" {
				parts = append(parts, text)
			}
		})
	}

	if fetched := strings.Join(parts, ""); fetched != "" {
		p.AddField("content", fetched)
	}
}

开发者ID:wadee，项目名称:go_proj，代码行数:38，代码来源:website_crawler.go

注：本文中的github.com/hu17889/go_spider/core/common/page.Page.GetHtmlParser方法示例由纯净天空整理自GitHub/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。